diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d46ae1894..76b18cdbe 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -8,40 +8,52 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 + with: + submodules: recursive - name: make run: | sudo apt-get update - sudo apt-get -y install uuid-dev libcurl4-openssl-dev - make KEYDB_CFLAGS='-Werror' KEYDB_CXXFLAGS='-Werror' BUILD_TLS=yes -j2 + sudo apt-get -y remove libzstd || true + sudo apt-get -y install uuid-dev libcurl4-openssl-dev libbz2-dev zlib1g-dev libsnappy-dev liblz4-dev libzstd-dev libgflags-dev + make BUILD_TLS=yes -j2 KEYDB_CFLAGS='-Werror' KEYDB_CXXFLAGS='-Werror' - name: gen-cert run: ./utils/gen-test-certs.sh - name: test-tls run: | sudo apt-get -y install tcl tcl-tls - ./runtest --clients 2 --verbose --tls + ./runtest --clients 1 --verbose --tls --config server-threads 3 - name: cluster-test run: | - ./runtest-cluster --tls + ./runtest-cluster --tls --config server-threads 3 - name: sentinel test run: | ./runtest-sentinel - name: module tests run: | ./runtest-moduleapi + - name: rotation test + run: | + ./runtest-rotation build-ubuntu-old: - runs-on: ubuntu-18.04 + runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v1 + - uses: actions/checkout@v2 + with: + submodules: recursive - name: make -j2 run: | sudo apt-get update - sudo apt-get -y install uuid-dev libcurl4-openssl-dev + sudo apt-get -y remove libzstd || true + sudo apt-get -y install uuid-dev libcurl4-openssl-dev libbz2-dev zlib1g-dev libsnappy-dev liblz4-dev libzstd-dev libgflags-dev make -j2 + build-macos-latest: runs-on: macos-latest steps: - uses: actions/checkout@v2 + with: + submodules: recursive - name: make run: make KEYDB_CFLAGS='-Werror' KEYDB_CXXFLAGS='-Werror' -j2 @@ -49,18 +61,11 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 + with: + submodules: recursive - name: make run: | sudo apt-get update - sudo apt-get -y install uuid-dev libcurl4-openssl-dev + sudo apt-get -y remove libzstd || true + sudo apt-get -y install uuid-dev libcurl4-openssl-dev libbz2-dev zlib1g-dev libsnappy-dev liblz4-dev libzstd-dev libgflags-dev make KEYDB_CFLAGS='-Werror' KEYDB_CXXFLAGS='-Werror' MALLOC=libc -j2 - build-ubuntu-32bit: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - name: make - run: | - sudo dpkg --add-architecture i386 - sudo apt-get update - sudo apt-get -y install gcc-multilib g++-multilib libc6-dev-i386 lib32z1 uuid-dev:i386 libcurl4-openssl-dev:i386 - make KEYDB_CFLAGS='-Werror' KEYDB_CXXFLAGS='-Werror' 32bit -j2 diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml deleted file mode 100644 index fdfe4e122..000000000 --- a/.gitlab-ci.yml +++ /dev/null @@ -1,164 +0,0 @@ -.standard-pipeline: - rules: - - if: '$COVERAGE' - when: never - - if: '$ENDURANCE' - when: never - - if: '$CI_PIPELINE_SOURCE == "push"' - -build: - extends: .standard-pipeline - tags: - - docker - stage: build - script: - - git submodule update --init - - make distclean - - make -j - artifacts: - paths: - - src/ - -runtest: - extends: .standard-pipeline - dependencies: - - build - tags: - - docker - stage: test - script: - - ./runtest --config server-threads 3 - -runtest-cluster: - extends: .standard-pipeline - dependencies: - - build - tags: - - docker - stage: test - script: - - ./runtest-cluster - -runtest-moduleapi: - extends: .standard-pipeline - dependencies: - - build - tags: - - docker - stage: test - script: - - ./runtest-moduleapi - -runtest-sentinel: - extends: .standard-pipeline - 
dependencies: - - build - tags: - - docker - stage: test - script: - - ./runtest-sentinel - -node-redis-test: - extends: .standard-pipeline - dependencies: - - build - rules: - - when: never - tags: - - docker - - ipv6 - stage: test - script: - - cp -pf src/keydb-server /usr/local/bin - - cp -pf src/keydb-cli /usr/local/bin - - git clone https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab.eqalpha.com/keydb-dev/node-redis.git - - cd node-redis - - npm install - - npm run test - -jedis-test: - extends: .standard-pipeline - dependencies: - - build - tags: - - docker - - ipv4 - stage: test - script: - - cp -pf src/keydb-server /usr/local/bin - - cp -pf src/keydb-cli /usr/local/bin - - git clone https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab.eqalpha.com/keydb-dev/jedis.git - - cd jedis - - make test - -redis-rs-test: - extends: .standard-pipeline - dependencies: - - build - tags: - - docker - stage: test - script: - - cp -pf src/keydb-server /usr/local/bin - - cp -pf src/keydb-cli /usr/local/bin - - git clone https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab.eqalpha.com/keydb-dev/redis-rs.git - - cd redis-rs - - make test - -endurance-test: - rules: - - if: '$ENDURANCE' - tags: - - docker - stage: test - script: - - git submodule update --init - - make distclean - - make -j - - ./runtest --config server-threads 3 --loop --stop - -coverage-test: - rules: - - if: '$COVERAGE' - tags: - - docker - stage: test - script: - - git submodule update --init - - make distclean - - make gcov -j - - make install - - ./runtest || true - - pkill keydb-server || true - - pkill stunnel || true - - ./runtest-cluster || true - - pkill keydb-server || true - - pkill stunnel || true - - ./runtest-sentinel || true - - pkill keydb-server || true - - pkill stunnel || true - - ./runtest-moduleapi || true - - pkill keydb-server || true - - pkill stunnel || true - - git clone https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab.eqalpha.com/keydb-dev/redis-rs.git - - cd redis-rs - - make test || true - - pkill keydb-server || true - - pkill stunnel || true - - cd .. - - git clone https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab.eqalpha.com/keydb-dev/jedis.git - - cd jedis - - make test || true - - pkill keydb-server || true - - pkill stunnel || true - - cd .. - - git clone https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab.eqalpha.com/keydb-dev/node-redis.git - - cd node-redis - - npm install - - npm run test || true - - pkill keydb-server || true - - pkill stunnel || true - - cd .. - - geninfo -o KeyDB.info --no-external . 
- - genhtml --legend -o lcov-html KeyDB.info \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 000000000..5e2cd9f46 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,98 @@ +cmake_minimum_required(VERSION 3.15) + +project(keydb VERSION 6.3.3 LANGUAGES C CXX) + +#===============# +# Compiler args # +#===============# + +set(CMAKE_C_VISIBILITY_PRESET hidden) +set(CMAKE_VISIBILITY_INLINES_HIDDEN YES) +set(CMAKE_C_STANDARD 99 CACHE STRING "") + +add_library("${PROJECT_NAME}_compiler_flags" INTERFACE) +target_compile_features("${PROJECT_NAME}_compiler_flags" INTERFACE "c_std_${CMAKE_C_STANDARD}") + +# add compiler warning flags just when building this project via +# the BUILD_INTERFACE genex +set(gcc_like "$<COMPILE_LANG_AND_ID:CXX,ARM_Clang,AppleClang,Clang,GNU,LCC>") +set(msvc "$<COMPILE_LANG_AND_ID:CXX,MSVC>") +target_compile_options( + "${PROJECT_NAME}_compiler_flags" + INTERFACE + "$<${gcc_like}:$<BUILD_INTERFACE:-Wall;-Wextra;-Wshadow;-Wformat=2;-Wunused>>" + "$<${msvc}:$<BUILD_INTERFACE:-W3>>" +) + +# Set the build directories +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/bin") +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/lib") + +#===========# +# Configure # +#===========# + +# configure a header file to pass the version number only +configure_file( + "${CMAKE_CURRENT_SOURCE_DIR}/cmake/config.h.in" + "${PROJECT_NAME}Config.h" +) + +#=============# +# Sub-targets # +#=============# + +add_subdirectory("src") + +include(CTest) +if (BUILD_TESTING) + #add_subdirectory("tests") +endif (BUILD_TESTING) + +#=========# +# Install # +#=========# + +include(GNUInstallDirs) + +install( + FILES "${PROJECT_BINARY_DIR}/${PROJECT_NAME}Config.h" + TYPE "INCLUDE" +) +include(InstallRequiredSystemLibraries) +set(CPACK_BUNDLE_NAME "${PROJECT_NAME}") +set(CPACK_PACKAGE_VENDOR "SamuelMarks") +set(CPACK_PACKAGE_DESCRIPTION "KeyDB is a high performance fork of Redis with a focus on multithreading, memory efficiency, and high throughput.") +set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "${CPACK_PACKAGE_DESCRIPTION}") +if (APPLE) + set(CPACK_BUNDLE_PLIST "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Info.plist") + set(CPACK_BUNDLE_ICON "${CMAKE_CURRENT_SOURCE_DIR}/cmake/BundleIcon.icns") + set(CPACK_PACKAGE_ICON "${CMAKE_CURRENT_SOURCE_DIR}/cmake/CustomVolumeIcon.icns") +endif (APPLE) +set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/COPYING") +set(CPACK_PACKAGE_VERSION_MAJOR "${${PROJECT_NAME}_VERSION_MAJOR}") +set(CPACK_PACKAGE_VERSION_MINOR "${${PROJECT_NAME}_VERSION_MINOR}") +set(CPACK_RESOURCE_FILE_README "${CMAKE_CURRENT_SOURCE_DIR}/cmake/README.txt") +set(CPACK_RESOURCE_FILE_WELCOME "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Welcome.txt") +set(CPACK_PACKAGE_CONTACT "https://github.com/Snapchat/KeyDB") + +include(CPack) +include(CMakePackageConfigHelpers) + +# generate the config file that includes the exports +configure_package_config_file( + "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Config.cmake.in" + "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake" + INSTALL_DESTINATION "${CMAKE_INSTALL_DATADIR}/${PROJECT_NAME}" + NO_SET_AND_CHECK_MACRO + NO_CHECK_REQUIRED_COMPONENTS_MACRO +) + +# generate the version file for the config file +write_basic_package_version_file( + "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake" + VERSION "${${PROJECT_NAME}_VERSION_MAJOR}.${${PROJECT_NAME}_VERSION_MINOR}" + COMPATIBILITY AnyNewerVersion +) +install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake" + DESTINATION "${CMAKE_INSTALL_DATADIR}/${PROJECT_NAME}") diff --git a/README.md b/README.md index f69ef58ba..20adc97fe 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ ##### KeyDB is now a part of Snap Inc!
Check out the announcement [here](https://docs.keydb.dev/news/2022/05/12/keydb-joins-snap) -##### [Release v6.3.0](https://github.com/EQ-Alpha/KeyDB/releases/tag/v6.3.0) is here with major improvements as we consolodate our Open Source and Enterprise offerings into a single BSD-3 licensed project. See our [roadmap](https://docs.keydb.dev/docs/coming-soon) for details. +##### [Release v6.3.0](https://github.com/EQ-Alpha/KeyDB/releases/tag/v6.3.0) is here with major improvements as we consolidate our Open Source and Enterprise offerings into a single BSD-3 licensed project. See our [roadmap](https://docs.keydb.dev/docs/coming-soon) for details. ##### Want to extend KeyDB with Javascript? Try [ModJS](https://github.com/JohnSully/ModJS) @@ -37,7 +37,7 @@ Because of this difference of opinion features which are right for KeyDB may not Project Support ------------------- -The KeyDB team maintains this project as part of Snap Inc. KeyDB is used by Snap as part of its caching infrastructure and is fully open sourced. There is no separate commercial product and no paid support options available. We really value collaborating with the open source community and welcome PRs, bug reports, and open discussion. For community support or to get involved further with the project check out our community support options [here](https://docs.keydb.dev/docs/support) (slack, forum, meetup, github issues). Our team monitors these channlels regularly. +The KeyDB team maintains this project as part of Snap Inc. KeyDB is used by Snap as part of its caching infrastructure and is fully open sourced. There is no separate commercial product and no paid support options available. We really value collaborating with the open source community and welcome PRs, bug reports, and open discussion. For community support or to get involved further with the project check out our community support options [here](https://docs.keydb.dev/docs/support) (slack, forum, meetup, github issues). Our team monitors these channels regularly. Additional Resources @@ -104,6 +104,12 @@ Avoid forwarding RREPLAY messages to other masters? WARNING: This setting is dan If you would like KeyDB to dump and load directly to AWS S3 this option specifies the bucket. Using this option with the traditional RDB options will result in KeyDB backing up twice to both locations. If both are specified KeyDB will first attempt to load from the local dump file and if that fails load from S3. This requires the AWS CLI tools to be installed and configured which are used under the hood to transfer the data. +``` +storage-provider flash /path/to/flash +``` +If you would like to use KeyDB FLASH storage, specify the storage medium followed by the directory path on your local SSD volume. Note that this feature is still considered experimental and should be used with discretion. See [FLASH Documentation](https://docs.keydb.dev/docs/flash) for more details on configuration and setting up your FLASH volume. + + Building KeyDB -------------- @@ -135,9 +141,9 @@ To append a suffix to KeyDB program names, use: ***Note that the following dependencies may be needed: % sudo apt-get install autoconf autotools-dev libnuma-dev libtool -To buik=ld with TLS support, use: +KeyDB by default is built with TLS enabled. 
To build without TLS support, use: - % make BUILD_TLS=yes + % make BUILD_TLS=no Running the tests with TLS enabled (you will need `tcl-tls` installed): @@ -145,6 +151,12 @@ installed): % ./utils/gen-test-certs.sh % ./runtest --tls +To build with KeyDB FLASH support, use: + + % make ENABLE_FLASH=yes + +***Note that the KeyDB FLASH feature is considered experimental (beta) and should be used with discretion + Fixing build problems with dependencies or cached build options --------- diff --git a/build.yaml b/build.yaml index 0500f5b7f..744ae947a 100644 --- a/build.yaml +++ b/build.yaml @@ -21,13 +21,13 @@ machamp: parent: make-build # https://github.sc-corp.net/Snapchat/img/tree/master/keydb/ubuntu-20-04 builder_image: us.gcr.io/snapchat-build-artifacts/prod/snapchat/img/keydb/keydb-ubuntu-20-04@sha256:cf869a3f5d1de1e1d976bb906689c37b7031938eb68661b844a38c532f27248c - command: ./runtest --clients $(nproc) --verbose --tls + command: ./runtest --clients 4 --verbose cluster-test: type: cmd parent: make-build # https://github.sc-corp.net/Snapchat/img/tree/master/keydb/ubuntu-20-04 builder_image: us.gcr.io/snapchat-build-artifacts/prod/snapchat/img/keydb/keydb-ubuntu-20-04@sha256:cf869a3f5d1de1e1d976bb906689c37b7031938eb68661b844a38c532f27248c - command: ./runtest-cluster --tls + command: ./runtest-cluster sentinel-test: type: cmd parent: make-build diff --git a/ci.yaml b/ci.yaml index 5614be8e6..ab346113f 100644 --- a/ci.yaml +++ b/ci.yaml @@ -9,7 +9,7 @@ on: build_name: keydb-build arch_types: ["amd64", "arm64"] push: - - branches: [master] + - branches: [main] workflows: - workflow_type: backend_workflow build_name: keydb-build diff --git a/cmake/BundleIcon.icns b/cmake/BundleIcon.icns new file mode 100644 index 000000000..8808dd62d Binary files /dev/null and b/cmake/BundleIcon.icns differ diff --git a/cmake/CTestConfig.cmake b/cmake/CTestConfig.cmake new file mode 100644 index 000000000..62e029d94 --- /dev/null +++ b/cmake/CTestConfig.cmake @@ -0,0 +1,7 @@ +set(CTEST_PROJECT_NAME "keydb") +set(CTEST_NIGHTLY_START_TIME "00:00:00 EST") + +set(CTEST_DROP_METHOD "http") +set(CTEST_DROP_SITE "my.cdash.org") +set(CTEST_DROP_LOCATION "/submit.php?project=keydb") +set(CTEST_DROP_SITE_CDASH TRUE) diff --git a/cmake/Config.cmake.in b/cmake/Config.cmake.in new file mode 100644 index 000000000..3649fbfcf --- /dev/null +++ b/cmake/Config.cmake.in @@ -0,0 +1,4 @@ + +@PACKAGE_INIT@ + +include ( "${CMAKE_CURRENT_LIST_DIR}/keydbTargets.cmake" ) diff --git a/cmake/CustomVolumeIcon.icns b/cmake/CustomVolumeIcon.icns new file mode 100644 index 000000000..3862a5191 Binary files /dev/null and b/cmake/CustomVolumeIcon.icns differ diff --git a/cmake/Info.plist b/cmake/Info.plist new file mode 100644 index 000000000..e5a7d0047 --- /dev/null +++ b/cmake/Info.plist @@ -0,0 +1,14 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE plist PUBLIC "-//Apple Computer//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd"> +<plist version="1.0"> +<dict> + <key>CFBundleExecutable</key> + <string>BundleGeneratorTest</string> + <key>CFBundleIconFile</key> + <string>BundleGeneratorTest.icns</string> + <key>CFBundleInfoDictionaryVersion</key> + <string>6.0</string> + <key>CFBundlePackageType</key> + <string>APPL</string> +</dict> +</plist> diff --git a/cmake/MultiCPackConfig.cmake b/cmake/MultiCPackConfig.cmake new file mode 100644 index 000000000..4c1f3d5d3 --- /dev/null +++ b/cmake/MultiCPackConfig.cmake @@ -0,0 +1,6 @@ +include("release/CPackConfig.cmake") + +set(CPACK_INSTALL_CMAKE_PROJECTS + "debug;keydb;ALL;/" + "release;keydb;ALL;/" +) diff --git a/cmake/README.txt b/cmake/README.txt new file mode 100644 index 000000000..8e39c4763 --- /dev/null +++ b/cmake/README.txt @@ -0,0 +1,7 @@ +keydb + +KeyDB is a high performance fork of Redis with a focus on multithreading, memory efficiency, and high throughput.
In addition to performance improvements, KeyDB offers features such as Active Replication, FLASH Storage and Subkey Expires. KeyDB has an MVCC architecture that allows you to execute queries such as KEYS and SCAN without blocking the database and degrading performance. + +------------------------------------------------------------------------ + +Licensed under BSD-3-Clause diff --git a/cmake/Welcome.txt b/cmake/Welcome.txt new file mode 100644 index 000000000..1966e2326 --- /dev/null +++ b/cmake/Welcome.txt @@ -0,0 +1 @@ +This installs keydb. diff --git a/cmake/config.h.in b/cmake/config.h.in new file mode 100644 index 000000000..97c3a68ae --- /dev/null +++ b/cmake/config.h.in @@ -0,0 +1,9 @@ +#ifndef KEYDB_CONFIG_H +#define KEYDB_CONFIG_H + +#define KEYDB_VERSION_MAJOR @keydb_VERSION_MAJOR@ +#define KEYDB_VERSION_MINOR @keydb_VERSION_MINOR@ +#define KEYDB_VERSION_PATCH @keydb_VERSION_PATCH@ +#define KEYDB_VERSION "@keydb_VERSION@" + +#endif /* !KEYDB_CONFIG_H */ diff --git a/deps/Makefile b/deps/Makefile index 7f622b9c3..28cf4e01d 100644 --- a/deps/Makefile +++ b/deps/Makefile @@ -95,17 +95,17 @@ JEMALLOC_LDFLAGS= $(LDFLAGS) jemalloc: .make-prerequisites @printf '%b %b\n' $(MAKECOLOR)MAKE$(ENDCOLOR) $(BINCOLOR)$@$(ENDCOLOR) - cd jemalloc && ./configure --with-version=5.2.1-0-g0 --with-lg-quantum=3 --with-jemalloc-prefix=je_ --disable-cxx CFLAGS="$(JEMALLOC_CFLAGS)" LDFLAGS="$(JEMALLOC_LDFLAGS)" + cd jemalloc && ./configure --with-version=5.2.1-0-g0 --with-lg-quantum=3 --disable-cxx CFLAGS="$(JEMALLOC_CFLAGS)" LDFLAGS="$(JEMALLOC_LDFLAGS)" cd jemalloc && $(MAKE) CFLAGS="$(JEMALLOC_CFLAGS)" LDFLAGS="$(JEMALLOC_LDFLAGS)" lib/libjemalloc.a .PHONY: jemalloc rocksdb: .make-prerequisites -# @printf '%b %b\n' $(MAKECOLOR)MAKE$(ENDCOLOR) $(BINCOLOR)$@$(ENDCOLOR) + @printf '%b %b\n' $(MAKECOLOR)MAKE$(ENDCOLOR) $(BINCOLOR)$@$(ENDCOLOR) ifeq ($(uname_M),x86_64) -# cd rocksdb && CFLAGS=-Wno-error PORTABLE=1 USE_SSE=1 FORCE_SSE42=1 $(MAKE) static_lib + cd rocksdb && CFLAGS=-Wno-error PORTABLE=1 USE_SSE=1 FORCE_SSE42=1 $(MAKE) static_lib else -# cd rocksdb && PORTABLE=1 $(MAKE) static_lib + cd rocksdb && PORTABLE=1 $(MAKE) static_lib endif .PHONY: rocksdb diff --git a/deps/rocksdb b/deps/rocksdb index e3169e3ea..444b3f484 160000 --- a/deps/rocksdb +++ b/deps/rocksdb @@ -1 +1 @@ -Subproject commit e3169e3ea8762d2f34880742106858a23c8dc8b7 +Subproject commit 444b3f4845dd01b0d127c4b420fdd3b50ad56682 diff --git a/keydb.conf b/keydb.conf index 355271150..02e97d4be 100644 --- a/keydb.conf +++ b/keydb.conf @@ -184,7 +184,9 @@ tcp-keepalive 300 # # tls-client-key-file-pass secret -# Configure a DH parameters file to enable Diffie-Hellman (DH) key exchange: +# Configure a DH parameters file to enable Diffie-Hellman (DH) key exchange, +# required by older versions of OpenSSL (<3.0). Newer versions do not require +# this configuration and recommend against it. # # tls-dh-params-file keydb.dh @@ -1143,6 +1145,11 @@ acllog-max-len 128 # # active-expire-effort 1 +# Force evictions when used system memory reaches X% of total system memory. +# This is useful as a safeguard to prevent OOM kills (0 to disable). +# +# force-eviction-percent 0 + ############################# LAZY FREEING #################################### # KeyDB has two primitives to delete keys. One is called DEL and is a blocking
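A quick illustration of the arithmetic behind the new force-eviction-percent guard in the hunk above; the helper names are invented for this sketch and are not KeyDB's internals:

```cpp
// Sketch of the force-eviction-percent safeguard: evict once *system* memory
// use crosses a percentage of total RAM. usedSystemMemory/totalSystemMemory
// are stand-ins for real accounting (e.g. sysinfo(2) on Linux).
#include <cstdint>
#include <cstdio>

static uint64_t totalSystemMemory() { return 16ull << 30; }  // pretend 16 GiB
static uint64_t usedSystemMemory()  { return 14ull << 30; }  // pretend 14 GiB in use

static bool shouldForceEvict(int force_eviction_percent) {
    if (force_eviction_percent <= 0) return false;  // 0 disables the guard
    uint64_t threshold = totalSystemMemory() / 100 * (uint64_t)force_eviction_percent;
    return usedSystemMemory() >= threshold;
}

int main() {
    // 14 GiB of 16 GiB is 87.5%, so a force-eviction-percent of 80 trips it.
    std::printf("force evictions: %s\n", shouldForceEvict(80) ? "yes" : "no");
    return 0;
}
```

Unlike maxmemory, which bounds KeyDB's own allocations, this check is framed against whole-system usage, which is why the comment pitches it as an OOM-kill safeguard.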
@@ -2071,4 +2078,31 @@ replica-weighting-factor 2 # # By default this is enabled # -active-client-balancing yes \ No newline at end of file +active-client-balancing yes + +# Enable FLASH support (Experimental Feature) +# storage-provider flash /path/to/flash/db + +# Blob support is a way to store very large objects (>200MB) on disk +# The files are automatically cleaned up when KeyDB exits and are only +# for temporary use. This helps reduce memory pressure for very large +# data items at the cost of some performance. +# +# By default this config is disabled. When enabled the disk associated +# with KeyDB's working directory will be used. If there is insufficient +# disk space or any other I/O error KeyDB will instead use memory. +# +# blob-support false + +# Begin load shedding if we use more than X% CPU relative to the number of server threads +# E.g. if overload-protect-percent is set to 80 and there are 8 server-threads, then the +# actual CPU protection will be 8 * 100 * 0.80 = 640% CPU usage. +# +# Set to 0 to disable +# overload-protect-percent 0 + +# Inform KeyDB of the availability zone if running in a cloud environment. Currently +# this is only exposed via the info command for clients to use, but in the future we +# may also use this when making decisions for replication. +# +# availability-zone "us-east-1a" \ No newline at end of file diff --git a/machamp_scripts/build.sh b/machamp_scripts/build.sh new file mode 100755 index 000000000..e05a9b3bd --- /dev/null +++ b/machamp_scripts/build.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +# make the build +git submodule init && git submodule update +make BUILD_TLS=yes -j$(nproc) KEYDB_CFLAGS='-Werror' KEYDB_CXXFLAGS='-Werror' + +# gen-cert +./utils/gen-test-certs.sh \ No newline at end of file diff --git a/pkg/deb/conf/keydb.conf b/pkg/deb/conf/keydb.conf index 8be0b523e..9567087a4 100644 --- a/pkg/deb/conf/keydb.conf +++ b/pkg/deb/conf/keydb.conf @@ -2063,3 +2063,17 @@ server-threads 2 # # By default KeyDB sets this to 2. replica-weighting-factor 2 + +# Enable FLASH support (Experimental Feature) +# storage-provider flash /path/to/flash/db + +# Blob support is a way to store very large objects (>200MB) on disk +# The files are automatically cleaned up when KeyDB exits and are only +# for temporary use. This helps reduce memory pressure for very large +# data items at the cost of some performance. +# +# By default this config is disabled. When enabled the disk associated +# with KeyDB's working directory will be used. If there is insufficient +# disk space or any other I/O error KeyDB will instead use memory. +# +# blob-support false
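The blob-support comment above (repeated verbatim in each packaged conf) describes a spill-to-disk fallback; a deliberately simplified sketch of that flow follows. BlobRef, kBlobThreshold, and the file naming are invented for illustration and do not reflect KeyDB's actual implementation:

```cpp
// Toy version of the blob-support idea: values over a size threshold are
// written to a temporary file under the working directory; on any I/O error
// (e.g. insufficient disk space) the value stays in memory instead.
#include <cstdio>
#include <fstream>
#include <string>
#include <vector>

struct BlobRef {
    bool onDisk = false;
    std::string path;       // set when the value was spilled to disk
    std::vector<char> mem;  // fallback in-memory copy
};

BlobRef storeValue(const std::vector<char> &val, const std::string &workdir) {
    const size_t kBlobThreshold = 200u * 1024 * 1024;  // ">200MB" per the comment
    BlobRef ref;
    if (val.size() > kBlobThreshold) {
        ref.path = workdir + "/keydb-blob.tmp";  // real code would pick unique names
        std::ofstream out(ref.path, std::ios::binary);
        out.write(val.data(), (std::streamsize)val.size());
        out.flush();
        if (out.good()) {
            ref.onDisk = true;  // temporary file; cleaned up when the server exits
            return ref;
        }
        std::remove(ref.path.c_str());  // write failed: fall back to memory
        ref.path.clear();
    }
    ref.mem = val;
    return ref;
}
```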
+# +# blob-support false diff --git a/pkg/deb/deb-buildsource.sh b/pkg/deb/deb-buildsource.sh index 8d50035f1..99108a2af 100755 --- a/pkg/deb/deb-buildsource.sh +++ b/pkg/deb/deb-buildsource.sh @@ -19,12 +19,17 @@ elif [ "$distributor" == "Ubuntu" ]; then fi codename=$(lsb_release --codename --short) date=$(date +%a," "%d" "%b" "%Y" "%T) + +# overwrite debian bookworm version until updated +if [ $codename == "bookworm" ]; then + distname=+deb12u1 +fi pkg_name=keydb-$majorv:$version$distname # create build tree cd ../../../ -tar -czvf keydb_$version.orig.tar.gz --force-local keydb-internal -cd keydb-internal/pkg/deb/ +tar -czvf keydb_$version.orig.tar.gz --force-local KeyDB +cd KeyDB/pkg/deb/ mkdir -p $pkg_name/tmp if [[ "$codename" == "xenial" ]] || [[ "$codename" == "stretch" ]]; then cp -r debian_dh9 $pkg_name/tmp/debian diff --git a/pkg/deb/debian/control b/pkg/deb/debian/control index 1091a4ba1..767f00cac 100644 --- a/pkg/deb/debian/control +++ b/pkg/deb/debian/control @@ -26,8 +26,8 @@ Build-Depends: libzstd-dev Standards-Version: 4.2.1 Homepage: https://docs.keydb.dev/ -Vcs-Git: https://github.com/EQ-Alpha/KeyDB.git -Vcs-Browser: https://github.com/EQ-Alpha/KeyDB +Vcs-Git: https://github.com/Snapchat/KeyDB.git +Vcs-Browser: https://github.com/Snapchat/KeyDB Package: keydb Architecture: all diff --git a/pkg/deb/debian/rules b/pkg/deb/debian/rules index 957e05f9a..51896759a 100755 --- a/pkg/deb/debian/rules +++ b/pkg/deb/debian/rules @@ -4,6 +4,7 @@ include /usr/share/dpkg/buildflags.mk export BUILD_TLS=yes export USE_SYSTEMD=yes +export ENABLE_FLASH=yes export CFLAGS CPPFLAGS LDFLAGS export DEB_BUILD_MAINT_OPTIONS = hardening=+all export DEB_LDFLAGS_MAINT_APPEND = -ldl -latomic $(LUA_LDFLAGS) diff --git a/pkg/deb/debian_dh9/control b/pkg/deb/debian_dh9/control index 0feec1220..657ad2066 100644 --- a/pkg/deb/debian_dh9/control +++ b/pkg/deb/debian_dh9/control @@ -25,8 +25,8 @@ Build-Depends: libzstd-dev Standards-Version: 4.2.1 Homepage: https://docs.keydb.dev/ -Vcs-Git: https://github.com/EQ-Alpha/KeyDB.git -Vcs-Browser: https://github.com/EQ-Alpha/KeyDB +Vcs-Git: https://github.com/Snapchat/KeyDB.git +Vcs-Browser: https://github.com/Snapchat/KeyDB Package: keydb Architecture: all diff --git a/pkg/deb/debian_dh9/rules b/pkg/deb/debian_dh9/rules index a568ae470..f2afc8a97 100755 --- a/pkg/deb/debian_dh9/rules +++ b/pkg/deb/debian_dh9/rules @@ -10,6 +10,7 @@ include /usr/share/dpkg/buildflags.mk export BUILD_TLS=yes export USE_SYSTEMD=yes +export ENABLE_FLASH=yes export CFLAGS CPPFLAGS LDFLAGS export DEB_BUILD_MAINT_OPTIONS = hardening=+all export DEB_LDFLAGS_MAINT_APPEND = -ldl -latomic $(LUA_LDFLAGS) diff --git a/pkg/deb/master_changelog b/pkg/deb/master_changelog index d3bf48042..31990bf3b 100644 --- a/pkg/deb/master_changelog +++ b/pkg/deb/master_changelog @@ -1,3 +1,11 @@ +keydb (6:6.3.1-1distribution_placeholder) codename_placeholder; urgency=medium + + * This point release contains fixes to bugs related to expires, active-rep, and rdb saving + * Issues fixed: #419, #422, #428 + * PRs: #426, #429, #431, #433 + +-- Ben Schermel Mon, 23 May 2022 20:00:37 +0000 + keydb (6:6.3.0-1distribution_placeholder) codename_placeholder; urgency=medium * This release open sources KeyDB Enterprise features into the open source project along with PSYNC for active replication diff --git a/pkg/docker/Dockerfile b/pkg/docker/Dockerfile index f0bbe2cca..9dc57b95b 100644 --- a/pkg/docker/Dockerfile +++ b/pkg/docker/Dockerfile @@ -53,7 +53,7 @@ RUN set -eux; \ grep -E '^ 
*createBoolConfig[(]"protected-mode",.*, *1 *,.*[)],$' ./src/config.cpp; \ sed -ri 's!^( *createBoolConfig[(]"protected-mode",.*, *)1( *,.*[)],)$!\10\2!' ./src/config.cpp; \ grep -E '^ *createBoolConfig[(]"protected-mode",.*, *0 *,.*[)],$' ./src/config.cpp; \ - make -j$(nproc) BUILD_TLS=yes; \ + make -j$(nproc) BUILD_TLS=yes ENABLE_FLASH=yes; \ cd src; \ strip keydb-cli keydb-benchmark keydb-check-rdb keydb-check-aof keydb-diagnostic-tool keydb-sentinel keydb-server; \ mv keydb-server keydb-cli keydb-benchmark keydb-check-rdb keydb-check-aof keydb-diagnostic-tool keydb-sentinel /usr/local/bin/; \ @@ -77,12 +77,13 @@ RUN \ mkdir /data && chown keydb:keydb /data; \ mkdir /flash && chown keydb:keydb /flash; \ mkdir -p /etc/keydb; \ - cp /tmp/keydb-internal/keydb.conf /etc/keydb/; \ + cp /tmp/KeyDB/keydb.conf /etc/keydb/; \ sed -i 's/^\(daemonize .*\)$/# \1/' /etc/keydb/keydb.conf; \ sed -i 's/^\(dir .*\)$/# \1\ndir \/data/' /etc/keydb/keydb.conf; \ sed -i 's/^\(logfile .*\)$/# \1/' /etc/keydb/keydb.conf; \ sed -i 's/protected-mode yes/protected-mode no/g' /etc/keydb/keydb.conf; \ sed -i 's/^\(bind .*\)$/# \1/' /etc/keydb/keydb.conf; \ + cd /usr/local/bin; \ ln -s keydb-cli redis-cli; \ cd /etc/keydb; \ ln -s keydb.conf redis.conf; \ @@ -110,8 +111,3 @@ ENV KEYDB_PRO_DIRECTORY=/usr/local/bin/ ENTRYPOINT ["docker-entrypoint.sh"] EXPOSE 6379 CMD ["keydb-server","/etc/keydb/keydb.conf"] - - - - - diff --git a/pkg/docker/Dockerfile_Alpine b/pkg/docker/Dockerfile_Alpine new file mode 100644 index 000000000..028fea3a5 --- /dev/null +++ b/pkg/docker/Dockerfile_Alpine @@ -0,0 +1,82 @@ +FROM alpine:3.12 +# add our user and group first to make sure their IDs get assigned consistently, regardless of whatever dependencies get added +RUN addgroup -S -g 1000 keydb && adduser -S -G keydb -u 999 keydb +RUN mkdir -p /etc/keydb +ARG BRANCH +RUN set -eux; \ + \ + apk add --no-cache su-exec tini; \ + apk add --no-cache --virtual .build-deps \ + coreutils \ + gcc \ + linux-headers \ + make \ + musl-dev \ + openssl-dev \ + git \ + util-linux-dev \ + curl-dev \ + g++ \ + libunwind-dev \ + bash \ + perl \ + git \ + bzip2-dev \ + zstd-dev \ + lz4-dev \ + snappy-dev \ + ; \ + cd /tmp && git clone --branch $BRANCH https://github.com/Snapchat/KeyDB.git --recursive; \ + cd /tmp/KeyDB; \ + # disable protected mode as it relates to docker + grep -E '^ *createBoolConfig[(]"protected-mode",.*, *1 *,.*[)],$' ./src/config.cpp; \ + sed -ri 's!^( *createBoolConfig[(]"protected-mode",.*, *)1( *,.*[)],)$!\10\2!' 
./src/config.cpp; \ + grep -E '^ *createBoolConfig[(]"protected-mode",.*, *0 *,.*[)],$' ./src/config.cpp; \ + make -j$(nproc) BUILD_TLS=yes ENABLE_FLASH=yes; \ + cd src; \ + strip keydb-cli keydb-benchmark keydb-check-rdb keydb-check-aof keydb-diagnostic-tool keydb-sentinel keydb-server; \ + mv keydb-server keydb-cli keydb-benchmark keydb-check-rdb keydb-check-aof keydb-diagnostic-tool keydb-sentinel /usr/local/bin/; \ + runDeps="$( \ + scanelf --needed --nobanner --format '%n#p' --recursive /usr/local \ + | tr ',' '\n' \ + | sort -u \ + | awk 'system("[ -e /usr/local/lib/" $1 " ]") == 0 { next } { print "so:" $1 }' \ + )"; \ + apk add --no-network --virtual .keydb-rundeps $runDeps; \ + apk del --no-network .build-deps; \ + # create working directories and organize files + mkdir /data && chown keydb:keydb /data; \ + mkdir /flash && chown keydb:keydb /flash; \ + mkdir -p /etc/keydb; \ + cp /tmp/KeyDB/keydb.conf /etc/keydb/; \ + sed -i 's/^\(daemonize .*\)$/# \1/' /etc/keydb/keydb.conf; \ + sed -i 's/^\(dir .*\)$/# \1\ndir \/data/' /etc/keydb/keydb.conf; \ + sed -i 's/^\(logfile .*\)$/# \1/' /etc/keydb/keydb.conf; \ + sed -i 's/protected-mode yes/protected-mode no/g' /etc/keydb/keydb.conf; \ + sed -i 's/^\(bind .*\)$/# \1/' /etc/keydb/keydb.conf; \ + cd /usr/local/bin; \ + ln -s keydb-cli redis-cli; \ + cd /etc/keydb; \ + ln -s keydb.conf redis.conf; \ + rm -rf /tmp/* +# generate entrypoint script +RUN set -eux; \ + echo '#!/bin/sh' > /usr/local/bin/docker-entrypoint.sh; \ + echo 'set -e' >> /usr/local/bin/docker-entrypoint.sh; \ + echo "# first arg is '-f' or '--some-option'" >> /usr/local/bin/docker-entrypoint.sh; \ + echo "# or first arg is \`something.conf\`" >> /usr/local/bin/docker-entrypoint.sh; \ + echo 'if [ "${1#-}" != "$1" ] || [ "${1%.conf}" != "$1" ]; then' >> /usr/local/bin/docker-entrypoint.sh; \ + echo ' set -- keydb-server "$@"' >> /usr/local/bin/docker-entrypoint.sh; \ + echo 'fi' >> /usr/local/bin/docker-entrypoint.sh; \ + echo "# allow the container to be started with \`--user\`" >> /usr/local/bin/docker-entrypoint.sh; \ + echo 'if [ "$1" = "keydb-server" -a "$(id -u)" = "0" ]; then' >> /usr/local/bin/docker-entrypoint.sh; \ + echo " find . \! -user keydb -exec chown keydb '{}' +" >> /usr/local/bin/docker-entrypoint.sh; \ + echo ' exec su-exec keydb "$0" "$@"' >> /usr/local/bin/docker-entrypoint.sh; \ + echo 'fi' >> /usr/local/bin/docker-entrypoint.sh; \ + echo 'exec "$@"' >> /usr/local/bin/docker-entrypoint.sh; \ + chmod +x /usr/local/bin/docker-entrypoint.sh +VOLUME /data +WORKDIR /data +ENTRYPOINT ["tini", "--", "docker-entrypoint.sh"] +EXPOSE 6379 +CMD ["keydb-server", "/etc/keydb/keydb.conf"] diff --git a/pkg/rpm/keydb_build/keydb_rpm/etc/keydb/keydb.conf b/pkg/rpm/keydb_build/keydb_rpm/etc/keydb/keydb.conf index fca1c74e6..6cca339b1 100644 --- a/pkg/rpm/keydb_build/keydb_rpm/etc/keydb/keydb.conf +++ b/pkg/rpm/keydb_build/keydb_rpm/etc/keydb/keydb.conf @@ -2063,3 +2063,17 @@ server-threads 2 # # By default KeyDB sets this to 2. replica-weighting-factor 2 + +# Enable FLASH support (Experimental Feature) +# storage-provider flash /path/to/flash/db + +# Blob support is a way to store very large objects (>200MB) on disk +# The files are automatically cleaned up when KeyDB exits and are only +# for temporary use. This helps reduce memory pressure for very large +# data items at the cost of some performance. +# +# By default this config is disabled. When enabled the disk associated +# with KeyDB's working directory will be used.
If there is insufficient +# disk space or any other I/O error KeyDB will instead use memory. +# +# blob-support false diff --git a/runtest-moduleapi b/runtest-moduleapi index 8adf2171d..4f0bca851 100755 --- a/runtest-moduleapi +++ b/runtest-moduleapi @@ -37,5 +37,6 @@ $TCLSH tests/test_helper.tcl \ --single unit/moduleapi/hash \ --single unit/moduleapi/zset \ --single unit/moduleapi/stream \ +--single unit/moduleapi/load \ --config server-threads 3 \ "${@}" diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 000000000..f19033a8d --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,233 @@ +get_filename_component(LIBRARY_NAME "${CMAKE_CURRENT_SOURCE_DIR}" NAME) + +set(Header_Files + "AsyncWorkQueue.h" + "IStorage.h" + "SnapshotPayloadParseState.h" + "StorageCache.h" + "adlist.h" + "ae.h" + "aelocker.h" + "anet.h" + "asciilogo.h" + "atomicvar.h" + "bio.h" + "cli_common.h" + "cluster.h" + "compactvector.h" + "config.h" + "connection.h" + "connhelpers.h" + "cowptr.h" + "crc16_slottable.h" + "crc64.h" + "crcspeed.h" + "cron.h" + "debugmacro.h" + "dict.h" + "endianconv.h" + "expire.h" + "fastlock.h" + "fmacros.h" + "gc.h" + "geo.h" + "geohash.h" + "geohash_helper.h" + "help.h" + "intset.h" + "latency.h" + "listpack.h" + "listpack_malloc.h" + "lolwut.h" + "lzf.h" + "lzfP.h" + "monotonic.h" + "motd.h" + "mt19937-64.h" + "new.h" + "pqsort.h" + "quicklist.h" + "rand.h" + "rax.h" + "rax_malloc.h" + "rdb.h" + "readwritelock.h" + "redis-cli.h" + "redisassert.h" + "redismodule.h" + "rio.h" + "sds.h" + "sdsalloc.h" + "semiorderedset.h" + "server.h" + "serverassert.h" + "sha1.h" + "sha256.h" + "slowlog.h" + "solarisfixes.h" + "sparkline.h" + "storage.h" + "stream.h" + "t_nhash.h" + "testhelp.h" + "util.h" + "uuid.h" + "version.h" + "ziplist.h" + "zipmap.h" + "zmalloc.h" + ) +source_group("Header Files" FILES "${Header_Files}") + +set(Source_Files + "AsyncWorkQueue.cpp" + "SnapshotPayloadParseState.cpp" + "StorageCache.cpp" + "acl.cpp" + "adlist.c" + "ae.cpp" + "ae_epoll.cpp" + "ae_evport.c" + "ae_kqueue.c" + "ae_select.c" + "anet.c" + "aof.cpp" + "bio.cpp" + "bitops.cpp" + "blocked.cpp" + "childinfo.cpp" + "cli_common.c" + "cluster.cpp" + "config.cpp" + "connection.cpp" + "crc16.c" + "crc64.c" + "crcspeed.c" + "cron.cpp" + "db.cpp" + "debug.cpp" + "defrag.cpp" + "dict.cpp" + "endianconv.c" + "evict.cpp" + "expire.cpp" + "fastlock.cpp" + "geo.cpp" + "geohash.c" + "geohash_helper.cpp" + "hyperloglog.cpp" + "intset.c" + "keydb-diagnostic-tool.cpp" + "keydbutils.cpp" + "latency.cpp" + "lazyfree.cpp" + "listpack.c" + "localtime.c" + "lolwut.c" + "lolwut5.c" + "lolwut6.c" + "lzf_c.c" + "lzf_d.c" + "meminfo.cpp" + "memtest.c" + "module.cpp" + "monotonic.c" + "motd.cpp" + "mt19937-64.c" + "multi.cpp" + "networking.cpp" + "new.cpp" + "notify.cpp" + "object.cpp" + "pqsort.c" + "pubsub.cpp" + "quicklist.c" + "rand.c" + "rax.c" + "rdb-s3.cpp" + "rdb.cpp" + "redis-benchmark.cpp" + "redis-check-aof.cpp" + "redis-check-rdb.cpp" + "redis-cli-cpphelper.cpp" + "redis-cli.c" + "release.c" + "replication.cpp" + "rio.cpp" + "scripting.cpp" + "sds.c" + "sentinel.cpp" + "server.cpp" + "setcpuaffinity.c" + "setproctitle.c" + "sha1.c" + "sha256.c" + "siphash.c" + "slowlog.cpp" + "snapshot.cpp" + "sort.cpp" + "sparkline.cpp" + "storage-lite.c" + "storage.cpp" + "syncio.cpp" + "t_hash.cpp" + "t_list.cpp" + "t_nhash.cpp" + "t_set.cpp" + "t_stream.cpp" + "t_string.cpp" + "t_zset.cpp" + "timeout.cpp" + "tls.cpp" + "tracking.cpp" + "util.c" + "ziplist.c" + "zipmap.c" + "zmalloc.cpp" + ) +source_group("Source 
Files" FILES "${Source_Files}") + +add_library("${LIBRARY_NAME}" "${Header_Files}" "${Source_Files}") + +target_link_libraries("${LIBRARY_NAME}" PRIVATE "${PROJECT_NAME}_compiler_flags") + +include(GenerateExportHeader) +set(_export_file "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}_export.h") +generate_export_header("${LIBRARY_NAME}" EXPORT_FILE_NAME "${_export_file}") + +include(GNUInstallDirs) +target_include_directories( + "${LIBRARY_NAME}" + PUBLIC + "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>" + "$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>" + "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>" +) + +#=========# +# Install # +#=========# + +# setup the version numbering +set_property(TARGET "${LIBRARY_NAME}" PROPERTY VERSION "1.0.0") +set_property(TARGET "${LIBRARY_NAME}" PROPERTY SOVERSION "1") + +install(FILES ${Header_Files} "${_export_file}" + TYPE "INCLUDE") + + +#====================# +# Package to install # +#====================# + +set(installable_libs "${LIBRARY_NAME}" "${PROJECT_NAME}_compiler_flags") +add_subdirectory("modules") +if (TARGET "${DEPENDANT_LIBRARY}") + list(APPEND installable_libs "${DEPENDANT_LIBRARY}") +endif () +install(TARGETS ${installable_libs} + EXPORT "${LIBRARY_NAME}Targets" + ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}" + LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}" + RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}") +install(EXPORT "${LIBRARY_NAME}Targets" DESTINATION "${CMAKE_INSTALL_DATADIR}/${PROJECT_NAME}") diff --git a/src/IStorage.h b/src/IStorage.h index ad956beb6..dc608d490 100644 --- a/src/IStorage.h +++ b/src/IStorage.h @@ -14,6 +14,7 @@ class IStorageFactory virtual class IStorage *createMetadataDb() = 0; virtual const char *name() const = 0; virtual size_t totalDiskspaceUsed() const = 0; + virtual sdsstring getInfo() const = 0; virtual bool FSlow() const = 0; virtual size_t filedsRequired() const { return 0; } }; @@ -31,6 +32,7 @@ class IStorage virtual void retrieve(const char *key, size_t cchKey, callbackSingle fn) const = 0; virtual size_t clear() = 0; virtual bool enumerate(callback fn) const = 0; + virtual bool enumerate_hashslot(callback fn, unsigned int hashslot) const = 0; virtual size_t count() const = 0; virtual void bulkInsert(char **rgkeys, size_t *rgcbkeys, char **rgvals, size_t *rgcbvals, size_t celem) {
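enumerate_hashslot, added to IStorage above, lets a provider enumerate only the keys of one cluster hashslot rather than walking the whole keyspace. A rough sketch of a provider honoring that contract; the callback type and the slot bucketing are illustrative simplifications, not the real IStorage definitions:

```cpp
// Illustrative-only sketch of serving enumerate_hashslot from a provider that
// buckets keys by hashslot; real providers might instead index their on-disk
// keys by a slot prefix so this becomes a range scan.
#include <cstddef>
#include <functional>
#include <map>
#include <string>
#include <unordered_map>

using callback = std::function<bool(const char *key, size_t cchKey)>;

struct ToyStore {
    // slot -> (key -> value)
    std::unordered_map<unsigned, std::map<std::string, std::string>> slots;

    bool enumerate_hashslot(callback fn, unsigned int hashslot) const {
        auto it = slots.find(hashslot);
        if (it == slots.end()) return true;  // empty slot: enumeration completed
        for (const auto &kv : it->second)
            if (!fn(kv.first.data(), kv.first.size()))
                return false;  // callback asked to stop early
        return true;
    }
};
```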
diff --git a/src/Makefile b/src/Makefile index d39806f80..a3cd741f4 100644 --- a/src/Makefile +++ b/src/Makefile @@ -20,12 +20,12 @@ release_hdr := $(shell sh -c './mkreleasehdr.sh') uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not') uname_M := $(shell sh -c 'uname -m 2>/dev/null || echo not') OPTIMIZATION?=-O2 -flto -DEPENDENCY_TARGETS=hiredis linenoise lua hdr_histogram rocksdb +DEPENDENCY_TARGETS=hiredis linenoise lua hdr_histogram NODEPS:=clean distclean # Default settings STD=-pedantic -DREDIS_STATIC='' -CXX_STD=-std=c++14 -pedantic -fno-rtti -D__STDC_FORMAT_MACROS +CXX_STD=-std=c++17 -pedantic -fno-rtti -D__STDC_FORMAT_MACROS ifneq (,$(findstring clang,$(CC))) STD+=-Wno-c11-extensions else @@ -62,6 +62,7 @@ endif endif USEASM?=true +ENABLE_FLASH?=no ifneq ($(strip $(SANITIZE)),) CFLAGS+= -fsanitize=$(SANITIZE) -DSANITIZE -fno-omit-frame-pointer @@ -71,6 +72,15 @@ ifneq ($(strip $(SANITIZE)),) USEASM=false endif +ifeq ($(ENABLE_FLASH),yes) + FINAL_LIBS+= -lz -lcrypto -lbz2 -lzstd -llz4 -lsnappy + CXXFLAGS+= -I../deps/rocksdb/include/ -DENABLE_ROCKSDB + STORAGE_OBJ+= storage/rocksdb.o storage/rocksdbfactory.o + FINAL_LIBS+= ../deps/rocksdb/librocksdb.a + DEPENDENCY_TARGETS+= rocksdb +endif + + ifeq ($(CHECKED),true) CXXFLAGS+= -DCHECKED_BUILD endif @@ -135,7 +145,9 @@ FINAL_LDFLAGS=$(LDFLAGS) $(KEYDB_LDFLAGS) $(DEBUG) FINAL_LIBS+=-lm -lz -lcrypto ifneq ($(uname_S),Darwin) + ifneq ($(uname_S),FreeBSD) FINAL_LIBS+=-latomic + endif endif # Linux ARM32 needs -latomic at linking time ifneq (,$(findstring armv,$(uname_M))) @@ -201,7 +213,13 @@ ifeq ($(uname_S),NetBSD) else ifeq ($(uname_S),FreeBSD) # FreeBSD - FINAL_LIBS+= -lpthread -lexecinfo + FINAL_LIBS+= -lpthread -luuid -lexecinfo + FINAL_CFLAGS+= -I/usr/local/include + FINAL_CXXFLAGS+= -I/usr/local/include + FINAL_LDFLAGS+= -L/usr/local/lib + ifeq ($(USE_BACKTRACE),yes) + FINAL_CFLAGS+= -DUSE_BACKTRACE + endif else ifeq ($(uname_S),DragonFly) # DragonFly @@ -366,7 +384,7 @@ endif REDIS_SERVER_NAME=keydb-server$(PROG_SUFFIX) REDIS_SENTINEL_NAME=keydb-sentinel$(PROG_SUFFIX) -REDIS_SERVER_OBJ=adlist.o quicklist.o ae.o anet.o dict.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o t_nhash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o scripting.o bio.o rio.o rand.o memtest.o crcspeed.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o acl.o storage.o rdb-s3.o fastlock.o new.o tracking.o cron.o connection.o tls.o sha256.o motd_server.o timeout.o setcpuaffinity.o AsyncWorkQueue.o snapshot.o storage/teststorageprovider.o keydbutils.o StorageCache.o monotonic.o cli_common.o mt19937-64.o $(ASM_OBJ) +REDIS_SERVER_OBJ=adlist.o quicklist.o ae.o anet.o dict.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o t_nhash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o scripting.o bio.o rio.o rand.o memtest.o crcspeed.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o acl.o storage.o rdb-s3.o fastlock.o new.o tracking.o cron.o connection.o tls.o sha256.o motd_server.o timeout.o setcpuaffinity.o AsyncWorkQueue.o snapshot.o storage/teststorageprovider.o keydbutils.o StorageCache.o monotonic.o cli_common.o mt19937-64.o meminfo.o $(ASM_OBJ) $(STORAGE_OBJ) KEYDB_SERVER_OBJ=SnapshotPayloadParseState.o REDIS_CLI_NAME=keydb-cli$(PROG_SUFFIX) REDIS_CLI_OBJ=anet.o adlist.o dict.o redis-cli.o redis-cli-cpphelper.o zmalloc.o release.o anet.o ae.o crcspeed.o crc64.o siphash.o crc16.o storage-lite.o fastlock.o motd_client.o monotonic.o cli_common.o mt19937-64.o $(ASM_OBJ) diff --git a/src/SnapshotPayloadParseState.cpp b/src/SnapshotPayloadParseState.cpp index 8ba4b109b..ef28e54f1 100644 --- a/src/SnapshotPayloadParseState.cpp +++ b/src/SnapshotPayloadParseState.cpp @@ -136,12 +136,12 @@ void SnapshotPayloadParseState::flushQueuedKeys() { int idb = current_database; serverAssert(vecqueuedKeys.size() == vecqueuedVals.size()); auto sizePrev = vecqueuedKeys.size(); - ++insertsInFlight; - auto &insertsInFlightTmp = insertsInFlight; // C++ GRRRRRRRRRRRRRRRR, we don't want to capute "this" because that's dangerous + (*insertsInFlight)++; + std::weak_ptr<std::atomic<int>> insertsInFlightTmp = insertsInFlight; // C++ GRRRRRRRRRRRRRRRR, we don't want to capture "this" because that's dangerous if (current_database < cserver.dbnum) { - g_pserver->asyncworkqueue->AddWorkFunction([idb, vecqueuedKeys = std::move(this->vecqueuedKeys), vecqueuedKeysCb = std::move(this->vecqueuedKeysCb), vecqueuedVals = std::move(this->vecqueuedVals), vecqueuedValsCb = std::move(this->vecqueuedValsCb), &insertsInFlightTmp, pallocator = m_spallocator.release()]() mutable { - g_pserver->db[idb]->bulkStorageInsert(vecqueuedKeys.data(), vecqueuedKeysCb.data(), vecqueuedVals.data(), vecqueuedValsCb.data(), vecqueuedKeys.size()); - --insertsInFlightTmp; + g_pserver->asyncworkqueue->AddWorkFunction([idb, vecqueuedKeys = std::move(this->vecqueuedKeys), vecqueuedKeysCb = std::move(this->vecqueuedKeysCb), vecqueuedVals = std::move(this->vecqueuedVals), vecqueuedValsCb = std::move(this->vecqueuedValsCb), insertsInFlightTmp, pallocator = m_spallocator.release()]() mutable { + g_pserver->db[idb]->bulkDirectStorageInsert(vecqueuedKeys.data(), vecqueuedKeysCb.data(), vecqueuedVals.data(), vecqueuedValsCb.data(), vecqueuedKeys.size()); + (*(insertsInFlightTmp.lock()))--; delete pallocator; }); } else { @@ -172,7 +172,7 @@ SnapshotPayloadParseState::SnapshotPayloadParseState() { dictLongLongMetaData = dictCreate(&metadataLongLongDictType, nullptr); dictMetaData = dictCreate(&metadataDictType, nullptr); - insertsInFlight = 0; + insertsInFlight = std::make_shared<std::atomic<int>>(); m_spallocator = std::make_unique(); } @@ -214,7 +214,7 @@ void SnapshotPayloadParseState::trimState() { if (stackParse.empty()) { flushQueuedKeys(); - while (insertsInFlight > 0) { + while (*insertsInFlight > 0) { // TODO: ProcessEventsWhileBlocked aeReleaseLock(); aeAcquireLock(); diff --git a/src/SnapshotPayloadParseState.h b/src/SnapshotPayloadParseState.h index cb1e0420a..29ac28fb6 100644 --- a/src/SnapshotPayloadParseState.h +++ b/src/SnapshotPayloadParseState.h @@ -36,7 +36,7 @@ class SnapshotPayloadParseState { std::vector<size_t> vecqueuedValsCb; - std::atomic<int> insertsInFlight; + std::shared_ptr<std::atomic<int>> insertsInFlight; std::unique_ptr m_spallocator; dict *dictLongLongMetaData = nullptr; dict *dictMetaData = nullptr; @@ -62,5 +62,5 @@ class SnapshotPayloadParseState { void pushArray(long long size); void pushValue(const char *rgch, long long cch); void pushValue(long long value); - bool shouldThrottle() const { return insertsInFlight > (cserver.cthreads*4); } + bool shouldThrottle() const { return *insertsInFlight > (cserver.cthreads*4); } }; \ No newline at end of file
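The capture change above is the crux: the queued lambda must not touch the parse state's members (the parse state may be destroyed before the async insert runs), so the counter moves behind a shared_ptr and the lambda holds a weak_ptr. A standalone sketch of that pattern, with names invented for illustration:

```cpp
// Sketch of the shared_ptr/weak_ptr pattern the diff adopts: async work
// captures a weak_ptr to a shared counter instead of `this` or a reference,
// so it stays safe even if the owning object dies first.
#include <atomic>
#include <functional>
#include <memory>
#include <vector>

struct ParseState {
    std::shared_ptr<std::atomic<int>> insertsInFlight =
        std::make_shared<std::atomic<int>>(0);

    void queueWork(std::vector<std::function<void()>> &queue) {
        ++(*insertsInFlight);
        std::weak_ptr<std::atomic<int>> wp = insertsInFlight;  // no `this` capture
        queue.push_back([wp] {
            // ... do the bulk insert ...
            if (auto sp = wp.lock())  // counter still alive?
                --(*sp);
        });
    }
};

int main() {
    std::vector<std::function<void()>> queue;
    {
        ParseState st;
        st.queueWork(queue);
    }                       // ParseState destroyed before the work runs
    for (auto &fn : queue)  // safe: the lambda only holds a weak_ptr
        fn();
    return 0;
}
```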
diff --git a/src/StorageCache.cpp b/src/StorageCache.cpp index d336e2b3d..91d4b3657 100644 --- a/src/StorageCache.cpp +++ b/src/StorageCache.cpp @@ -21,7 +21,7 @@ dictType dbStorageCacheType = { StorageCache::StorageCache(IStorage *storage, bool fCache) : m_spstorage(storage) { - if (fCache) + if (!g_pserver->flash_disable_key_cache && fCache) m_pdict = dictCreate(&dbStorageCacheType, nullptr); } @@ -43,6 +43,8 @@ void StorageCache::clear(void(callback)(void*)) void StorageCache::clearAsync() { std::unique_lock<fastlock> ul(m_lock); + if (count() == 0) + return; if (m_pdict != nullptr) { dict *dSav = m_pdict; m_pdict = dictCreate(&dbStorageCacheType, nullptr); @@ -203,6 +205,7 @@ void StorageCache::beginWriteBatch() { } void StorageCache::emergencyFreeCache() { + std::unique_lock<fastlock> ul(m_lock); dict *d = m_pdict; m_pdict = nullptr; if (d != nullptr) { diff --git a/src/StorageCache.h b/src/StorageCache.h index 3c38450fb..4f3c1a374 100644 --- a/src/StorageCache.h +++ b/src/StorageCache.h @@ -49,6 +49,7 @@ class
StorageCache void expand(uint64_t slots); bool enumerate(IStorage::callback fn) const { return m_spstorage->enumerate(fn); } + bool enumerate_hashslot(IStorage::callback fn, unsigned int hashslot) const { return m_spstorage->enumerate_hashslot(fn, hashslot); } void beginWriteBatch(); void endWriteBatch() { m_spstorage->endWriteBatch(); } diff --git a/src/ae.cpp b/src/ae.cpp index 702c465d4..99093daa8 100644 --- a/src/ae.cpp +++ b/src/ae.cpp @@ -716,11 +716,6 @@ int aeProcessEvents(aeEventLoop *eventLoop, int flags) } } - if (eventLoop->flags & AE_DONT_WAIT) { - tv.tv_sec = tv.tv_usec = 0; - tvp = &tv; - } - if (eventLoop->beforesleep != NULL && flags & AE_CALL_BEFORE_SLEEP) { std::unique_lock ulock(g_lock, std::defer_lock); if (!(eventLoop->beforesleepFlags & AE_SLEEP_THREADSAFE)) { @@ -731,6 +726,11 @@ int aeProcessEvents(aeEventLoop *eventLoop, int flags) eventLoop->beforesleep(eventLoop); } + if (eventLoop->flags & AE_DONT_WAIT) { + tv.tv_sec = tv.tv_usec = 0; + tvp = &tv; + } + /* Call the multiplexing API, will return only on timeout or when * some event fires. */ numevents = aeApiPoll(eventLoop, tvp); @@ -859,6 +859,11 @@ void aeReleaseForkLock() g_forkLock.downgradeWrite(); } +void aeForkLockInChild() +{ + g_forkLock.setNotify(false); +} + int aeThreadOwnsLock() { return fOwnLockOverride || g_lock.fOwnLock(); diff --git a/src/ae.h b/src/ae.h index cd513f652..3868db4a0 100644 --- a/src/ae.h +++ b/src/ae.h @@ -171,6 +171,7 @@ int aeTryAcquireLock(int fWeak); void aeThreadOffline(); void aeReleaseLock(); void aeReleaseForkLock(); +void aeForkLockInChild(); int aeThreadOwnsLock(); void aeSetThreadOwnsLockOverride(int fOverride); int aeLockContested(int threshold); diff --git a/src/anet.c b/src/anet.c index 0b3e462c3..c47853867 100644 --- a/src/anet.c +++ b/src/anet.c @@ -441,7 +441,7 @@ static int _anetTcpServer(char *err, int port, const char *bindaddr, int af, int char _port[6]; /* strlen("65535") */ struct addrinfo hints, *servinfo, *p; - snprintf(_port,6,"%d",port); + snprintf(_port,sizeof(_port),"%d",port); memset(&hints,0,sizeof(hints)); hints.ai_family = af; hints.ai_socktype = SOCK_STREAM; diff --git a/src/aof.cpp b/src/aof.cpp index 9fa9290bd..e529b4b0e 100644 --- a/src/aof.cpp +++ b/src/aof.cpp @@ -285,7 +285,7 @@ int startAppendOnly(void) { strerror(errno)); return C_ERR; } - if (hasActiveChildProcess() && g_pserver->child_type != CHILD_TYPE_AOF) { + if (hasActiveChildProcessOrBGSave() && g_pserver->child_type != CHILD_TYPE_AOF) { g_pserver->aof_rewrite_scheduled = 1; serverLog(LL_WARNING,"AOF was enabled but there is already another background operation. An AOF background was scheduled to start when possible."); } else { @@ -438,7 +438,7 @@ void flushAppendOnlyFile(int force) { * useful for graphing / monitoring purposes. */ if (sync_in_progress) { latencyAddSampleIfNeeded("aof-write-pending-fsync",latency); - } else if (hasActiveChildProcess()) { + } else if (hasActiveChildProcessOrBGSave()) { latencyAddSampleIfNeeded("aof-write-active-child",latency); } else { latencyAddSampleIfNeeded("aof-write-alone",latency); @@ -535,7 +535,7 @@ void flushAppendOnlyFile(int force) { try_fsync: /* Don't fsync if no-appendfsync-on-rewrite is set to yes and there are * children doing I/O in the background. */ - if (g_pserver->aof_no_fsync_on_rewrite && hasActiveChildProcess()) + if (g_pserver->aof_no_fsync_on_rewrite && hasActiveChildProcessOrBGSave()) return; /* Perform the fsync if needed. 
*/ @@ -752,7 +752,7 @@ void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int a * accumulate the differences between the child DB and the current one * in a buffer, so that when the child process will do its work we * can append the differences to the new append only file. */ - if (g_pserver->child_type == CHILD_TYPE_AOF) + if (hasActiveChildProcess() && g_pserver->child_type == CHILD_TYPE_AOF) aofRewriteBufferAppend((unsigned char*)buf,sdslen(buf)); sdsfree(buf); @@ -1658,7 +1658,7 @@ int rewriteAppendOnlyFile(char *filename) { { // BEGIN GOTO SCOPED VARIABLES /* Note that we have to use a different temp name here compared to the * one used by rewriteAppendOnlyFileBackground() function. */ - snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid()); + snprintf(tmpfile,sizeof(tmpfile),"temp-rewriteaof-%d.aof", (int) getpid()); fp = fopen(tmpfile,"w"); if (!fp) { serverLog(LL_WARNING, "Opening the temp file for AOF rewrite in rewriteAppendOnlyFile(): %s", strerror(errno)); @@ -1887,7 +1887,7 @@ void aofClosePipes(void) { int rewriteAppendOnlyFileBackground(void) { pid_t childpid; - if (hasActiveChildProcess()) return C_ERR; + if (hasActiveChildProcessOrBGSave()) return C_ERR; if (aofCreatePipes() != C_OK) return C_ERR; if ((childpid = redisFork(CHILD_TYPE_AOF)) == 0) { char tmpfile[256]; @@ -1895,7 +1895,7 @@ int rewriteAppendOnlyFileBackground(void) { /* Child */ redisSetProcTitle("keydb-aof-rewrite"); redisSetCpuAffinity(g_pserver->aof_rewrite_cpulist); - snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid()); + snprintf(tmpfile,sizeof(tmpfile),"temp-rewriteaof-bg-%d.aof", (int) getpid()); if (rewriteAppendOnlyFile(tmpfile) == C_OK) { sendChildCowInfo(CHILD_INFO_TYPE_AOF_COW_SIZE, "AOF rewrite"); exitFromChild(0); @@ -1930,7 +1930,7 @@ int rewriteAppendOnlyFileBackground(void) { void bgrewriteaofCommand(client *c) { if (g_pserver->child_type == CHILD_TYPE_AOF) { addReplyError(c,"Background append only file rewriting already in progress"); - } else if (hasActiveChildProcess()) { + } else if (hasActiveChildProcessOrBGSave()) { g_pserver->aof_rewrite_scheduled = 1; addReplyStatus(c,"Background append only file rewriting scheduled"); } else if (rewriteAppendOnlyFileBackground() == C_OK) { @@ -1944,10 +1944,10 @@ void bgrewriteaofCommand(client *c) { void aofRemoveTempFile(pid_t childpid) { char tmpfile[256]; - snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) childpid); + snprintf(tmpfile,sizeof(tmpfile),"temp-rewriteaof-bg-%d.aof", (int) childpid); bg_unlink(tmpfile); - snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) childpid); + snprintf(tmpfile,sizeof(tmpfile),"temp-rewriteaof-%d.aof", (int) childpid); bg_unlink(tmpfile); } @@ -1985,7 +1985,7 @@ void backgroundRewriteDoneHandler(int exitcode, int bysignal) { /* Flush the differences accumulated by the parent to the * rewritten AOF. 
*/ latencyStartMonitor(latency); - snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", + snprintf(tmpfile,sizeof(tmpfile),"temp-rewriteaof-bg-%d.aof", (int)g_pserver->child_pid); newfd = open(tmpfile,O_WRONLY|O_APPEND); if (newfd == -1) { diff --git a/src/bitops.cpp b/src/bitops.cpp index 6c3ee1038..2f25822cb 100644 --- a/src/bitops.cpp +++ b/src/bitops.cpp @@ -322,7 +322,7 @@ int checkUnsignedBitfieldOverflow(uint64_t value, int64_t incr, uint64_t bits, i return 1; } -int checkSignedBitfieldOverflow(int64_t value, int64_t incr, uint64_t bits, int owtype, int64_t *limit) { +int checkSignedBitfieldOverflow(int64_t value, int64_t incr, int bits, int owtype, int64_t *limit) { int64_t max = (bits == 64) ? INT64_MAX : (((int64_t)1<<(bits-1))-1); int64_t min = (-max)-1;
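The bits parameter change above feeds directly into the signed shift `(int64_t)1<<(bits-1)`, presumably to keep the comparison and shift arithmetic consistently signed. For reference, the bounds the function computes, in a standalone demo of mine (not the KeyDB source):

```cpp
// Demo of the signed-bitfield bounds used by BITFIELD overflow checks:
// for an N-bit signed field, max = 2^(N-1)-1 and min = -max-1.
#include <cinttypes>
#include <cstdint>
#include <cstdio>
#include <initializer_list>

int main() {
    for (int bits : {8, 16, 64}) {
        int64_t max = (bits == 64) ? INT64_MAX : (((int64_t)1 << (bits - 1)) - 1);
        int64_t min = (-max) - 1;
        std::printf("i%-2d range: [%" PRId64 ", %" PRId64 "]\n", bits, min, max);
    }
    // A positive increment overflows when value > max - incr; checking it this
    // way avoids undefined behavior from computing value + incr directly.
    int64_t value = 100, incr = 50, max8 = 127;
    std::printf("i8 100+50 overflows: %s\n",
                (incr > 0 && value > max8 - incr) ? "yes" : "no");
    return 0;
}
```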
diff --git a/src/blocked.cpp b/src/blocked.cpp index 15f3087ce..56df022ea 100644 --- a/src/blocked.cpp +++ b/src/blocked.cpp @@ -732,6 +732,7 @@ void unblockClientWaitingData(client *c) { c->bpop.xread_group = NULL; c->bpop.xread_consumer = NULL; } + c->bpop.timeout = 0; } static int getBlockedTypeByType(int type) { diff --git a/src/childinfo.cpp b/src/childinfo.cpp index ec3e3c133..f1fba91dd 100644 --- a/src/childinfo.cpp +++ b/src/childinfo.cpp @@ -42,6 +42,7 @@ typedef struct { * RDB / AOF saving process from the child to the parent (for instance * the amount of copy on write memory used) */ void openChildInfoPipe(void) { + serverAssert(g_pserver->child_info_pipe[0] == -1); if (pipe(g_pserver->child_info_pipe) == -1) { /* On error our two file descriptors should be still set to -1, * but we call anyway closeChildInfoPipe() since can't hurt. */ diff --git a/src/cluster.cpp b/src/cluster.cpp index 398384f16..82ad3d271 100644 --- a/src/cluster.cpp +++ b/src/cluster.cpp @@ -771,7 +771,7 @@ unsigned long getClusterConnectionsCount(void) { /* We decrement the number of nodes by one, since there is the * "myself" node too in the list. Each node uses two file descriptors, * one incoming and one outgoing, thus the multiplication by 2. */ - return g_pserver->cluster_enabled ? + return g_pserver->cluster_enabled && g_pserver->cluster != nullptr ? ((dictSize(g_pserver->cluster->nodes)-1)*2) : 0; } @@ -1525,7 +1525,9 @@ void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) { * it's greater than our view but is not in the future * (with 500 milliseconds tolerance) from the POV of our * clock. */ - if (pongtime <= (g_pserver->mstime+500) && + mstime_t mstime; + __atomic_load(&g_pserver->mstime, &mstime, __ATOMIC_RELAXED); + if (pongtime <= (mstime+500) && pongtime > node->pong_received) { node->pong_received = pongtime; @@ -4514,8 +4516,8 @@ void clusterCommand(client *c) { "NODES", " Return cluster configuration seen by node. Output format:", " ...", -"REPLICATE <node-id>", -" Configure current node as replica to <node-id>.", +"REPLICATE (<node-id>|NO ONE)", +" Configure current node as replica to <node-id> or turn it into empty primary.", "RESET [HARD|SOFT]", " Reset current node (default: soft).", "SET-CONFIG-EPOCH <epoch>", @@ -4888,14 +4890,22 @@ NULL clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE| CLUSTER_TODO_SAVE_CONFIG); addReply(c,shared.ok); - } else if (!strcasecmp(szFromObj(c->argv[1]),"replicate") && c->argc == 3) { - /* CLUSTER REPLICATE <NODE ID> */ - clusterNode *n = clusterLookupNode(szFromObj(c->argv[2])); - - /* Lookup the specified node in our table. */ - if (!n) { - addReplyErrorFormat(c,"Unknown node %s", (char*)ptrFromObj(c->argv[2])); - return; + } else if (!strcasecmp(szFromObj(c->argv[1]),"replicate") && (c->argc == 3 || c->argc == 4)) { + /* CLUSTER REPLICATE (<NODE ID>|NO ONE) */ + clusterNode *n; + if (c->argc == 4) { + if (0 != strcasecmp(szFromObj(c->argv[2]),"NO") || 0 != strcasecmp(szFromObj(c->argv[3]),"ONE")) { + addReplySubcommandSyntaxError(c); + return; + } + n = nullptr; + } else { + /* Lookup the specified node in our table. */ + n = clusterLookupNode(szFromObj(c->argv[2])); + if (n == nullptr) { + addReplyErrorFormat(c,"Unknown node %s", (char*)ptrFromObj(c->argv[2])); + return; + } } /* I can't replicate myself. */ @@ -4905,7 +4915,7 @@ NULL } /* Can't replicate a slave. */ - if (nodeIsSlave(n)) { + if (n != nullptr && nodeIsSlave(n)) { addReplyError(c,"I can only replicate a master, not a replica."); return; } @@ -4921,8 +4931,26 @@ NULL return; } - /* Set the master. */ - clusterSetMaster(n); + if (n == nullptr) { + if (nodeIsMaster(myself)) { + addReply(c,shared.ok); + return; + } + serverLog(LL_NOTICE,"Stopping replication and turning myself into an empty primary."); + clusterSetNodeAsMaster(myself); + if (listLength(g_pserver->masters) > 0) + { + serverAssert(listLength(g_pserver->masters) == 1); + replicationUnsetMaster((redisMaster*)listFirst(g_pserver->masters)->value); + } + int empty_db_flags = g_pserver->repl_slave_lazy_flush ? EMPTYDB_ASYNC : EMPTYDB_NO_FLAGS; + emptyDb(-1,empty_db_flags, nullptr); + /* Reset manual failover state. */ + resetManualFailover(); + } else { + /* Set the master. */ + clusterSetMaster(n); + } clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); addReply(c,shared.ok); } else if ((!strcasecmp(szFromObj(c->argv[1]),"slaves") || @@ -5105,7 +5133,7 @@ void createDumpPayload(rio *payload, robj_roptr o, robj *key) { serverAssert(rdbSaveObject(payload,o,key)); char szT[32]; uint64_t mvcc = mvccFromObj(o); - snprintf(szT, 32, "%" PRIu64, mvcc); + snprintf(szT, sizeof(szT), "%" PRIu64, mvcc); serverAssert(rdbSaveAuxFieldStrStr(payload,"mvcc-tstamp", szT) != -1); /* Write the footer, this is how it looks like: diff --git a/src/config.cpp b/src/config.cpp index a602dac15..574cf89cb 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -35,11 +35,13 @@ #include #include +#include <sys/wait.h> #ifdef __linux__ #include #endif const char *KEYDB_SET_VERSION = KEYDB_REAL_VERSION; +size_t g_semiOrderedSetTargetBucketSize = 0; // It's a header only class so nowhere else for this to go /*----------------------------------------------------------------------------- * Config file name-value maps. *----------------------------------------------------------------------------- @@ -355,16 +357,17 @@ bool initializeStorageProvider(const char **err) return true; if (!strcasecmp(g_sdsProvider, "flash") && g_sdsArgs != nullptr) { - // Temporary: Disable FLASH - serverLog(LL_WARNING, "FLASH Is Not yet Supported"); - _Exit(EXIT_FAILURE); - - +#ifdef ENABLE_ROCKSDB // Create The Storage Factory (if necessary) serverLog(LL_NOTICE, "Initializing FLASH storage provider (this may take a long time)"); adjustOpenFilesLimit(); g_pserver->m_pstorageFactory = CreateRocksDBStorageFactory(g_sdsArgs, cserver.dbnum, cserver.storage_conf, cserver.storage_conf ?
strlen(cserver.storage_conf) : 0); - } +#else + serverLog(LL_WARNING, "To use the flash storage provider please compile KeyDB with ENABLE_FLASH=yes"); + serverLog(LL_WARNING, "Exiting due to the use of an unsupported storage provider"); + exit(EXIT_FAILURE); +#endif + } else if (!strcasecmp(g_sdsProvider, "test") && g_sdsArgs == nullptr) { g_pserver->m_pstorageFactory = new (MALLOC_LOCAL) TestStorageFactory(); @@ -746,6 +749,18 @@ void loadServerConfigFromString(char *config) { g_pserver->fActiveReplica = CONFIG_DEFAULT_ACTIVE_REPLICA; err = "argument must be 'yes' or 'no'"; goto loaderr; } + if (listLength(g_pserver->masters) && g_pserver->fActiveReplica) { + err = "must not set replica-of config before active-replica config"; goto loaderr; + } + } else if (!strcasecmp(argv[0], "multi-master") && argc == 2) { + g_pserver->enable_multimaster = yesnotoi(argv[1]); + if (g_pserver->enable_multimaster == -1) { + g_pserver->enable_multimaster = CONFIG_DEFAULT_ENABLE_MULTIMASTER; + err = "argument must be 'yes' or 'no'"; goto loaderr; + } + if (listLength(g_pserver->masters) && g_pserver->enable_multimaster) { + err = "must not set replica-of config before multi-master config"; goto loaderr; + } } else if (!strcasecmp(argv[0], "tls-allowlist")) { if (argc < 2) { err = "must supply at least one element in the allow list"; goto loaderr; @@ -755,6 +770,15 @@ void loadServerConfigFromString(char *config) { } for (int i = 1; i < argc; i++) g_pserver->tls_allowlist.emplace(argv[i], strlen(argv[i])); + } else if (!strcasecmp(argv[0], "tls-auditlog-blocklist")) { + if (argc < 2) { + err = "must supply at least one element in the block list"; goto loaderr; + } + if (!g_pserver->tls_auditlog_blocklist.empty()) { + err = "tls-auditlog-blocklist may only be set once"; goto loaderr; + } + for (int i = 1; i < argc; i++) + g_pserver->tls_auditlog_blocklist.emplace(argv[i], strlen(argv[i])); } else if (!strcasecmp(argv[0], "version-override") && argc == 2) { KEYDB_SET_VERSION = zstrdup(argv[1]); serverLog(LL_WARNING, "Warning: version is overridden to: %s\n", KEYDB_SET_VERSION); @@ -766,6 +790,12 @@ void loadServerConfigFromString(char *config) { g_sdsProvider = sdsdup(argv[1]); if (argc > 2) g_sdsArgs = sdsdup(argv[2]); + } else if (!strcasecmp(argv[0],"is-flash-enabled") && argc == 1) { +#ifdef ENABLE_ROCKSDB + exit(EXIT_SUCCESS); +#else + exit(EXIT_FAILURE); +#endif + } else { err = "Bad directive or wrong number of arguments"; goto loaderr; } @@ -2091,7 +2121,10 @@ static void sdsConfigGet(client *c, typeData data) { } static void sdsConfigRewrite(typeData data, const char *name, struct rewriteConfigState *state) { - rewriteConfigSdsOption(state, name, *(data.sds.config), data.sds.default_value ? sdsnew(data.sds.default_value) : NULL); + sds sdsDefault = data.sds.default_value ?
sdsnew(data.sds.default_value) : NULL; + rewriteConfigSdsOption(state, name, *(data.sds.config), sdsDefault); + if (sdsDefault) + sdsfree(sdsDefault); } @@ -2448,6 +2481,32 @@ static int isValidAOFfilename(char *val, const char **err) { return 1; } +static int isValidS3Bucket(char *s3bucket, const char **err) { + int status = EXIT_FAILURE; + pid_t pid = fork(); + if (pid < 0) + { + *err = "couldn't fork to call aws cli"; + return 0; + } + + if (pid == 0) + { + execlp("aws", "aws", "s3", "ls", s3bucket, nullptr); + exit(EXIT_FAILURE); + } + else + { + waitpid(pid, &status, 0); + } + + if (!WIFEXITED(status) || WEXITSTATUS(status) != EXIT_SUCCESS) { + *err = "could not access s3 bucket"; + return 0; + } + return 1; +} + /* Validate specified string is a valid proc-title-template */ static int isValidProcTitleTemplate(char *val, const char **err) { if (!validateProcTitleTemplate(val)) { @@ -2580,7 +2639,7 @@ static int updateMaxclients(long long val, long long prev, const char **err) { adjustOpenFilesLimit(); if (g_pserver->maxclients != val) { static char msg[128]; - sprintf(msg, "The operating system is not able to handle the specified number of clients, try with %d", g_pserver->maxclients); + snprintf(msg, sizeof(msg), "The operating system is not able to handle the specified number of clients, try with %d", g_pserver->maxclients); *err = msg; if (g_pserver->maxclients > prev) { g_pserver->maxclients = prev; @@ -2619,7 +2678,7 @@ static int updateMaxclients(long long val, long long prev, const char **err) { if (res != AE_OK){ static char msg[128]; - sprintf(msg, "Failed to post the request to change setsize for Thread %d", iel); + snprintf(msg, sizeof(msg),"Failed to post the request to change setsize for Thread %d", iel); *err = msg; return 0; } @@ -2763,7 +2822,6 @@ standardConfig configs[] = { createBoolConfig("replica-serve-stale-data", "slave-serve-stale-data", MODIFIABLE_CONFIG, g_pserver->repl_serve_stale_data, 1, NULL, NULL), createBoolConfig("replica-read-only", "slave-read-only", MODIFIABLE_CONFIG, g_pserver->repl_slave_ro, 1, NULL, NULL), createBoolConfig("replica-ignore-maxmemory", "slave-ignore-maxmemory", MODIFIABLE_CONFIG, g_pserver->repl_slave_ignore_maxmemory, 1, NULL, NULL), - createBoolConfig("multi-master", NULL, IMMUTABLE_CONFIG, g_pserver->enable_multimaster,CONFIG_DEFAULT_ENABLE_MULTIMASTER, NULL, NULL), createBoolConfig("jemalloc-bg-thread", NULL, MODIFIABLE_CONFIG, cserver.jemalloc_bg_thread, 1, NULL, updateJemallocBgThread), createBoolConfig("activedefrag", NULL, MODIFIABLE_CONFIG, cserver.active_defrag_enabled, 0, isValidActiveDefrag, NULL), createBoolConfig("syslog-enabled", NULL, IMMUTABLE_CONFIG, g_pserver->syslog_enabled, 0, NULL, NULL), @@ -2771,7 +2829,7 @@ standardConfig configs[] = { createBoolConfig("appendonly", NULL, MODIFIABLE_CONFIG, g_pserver->aof_enabled, 0, NULL, updateAppendonly), createBoolConfig("cluster-allow-reads-when-down", NULL, MODIFIABLE_CONFIG, g_pserver->cluster_allow_reads_when_down, 0, NULL, NULL), createBoolConfig("delete-on-evict", NULL, MODIFIABLE_CONFIG, cserver.delete_on_evict, 0, NULL, NULL), - createBoolConfig("use-fork", NULL, IMMUTABLE_CONFIG, cserver.fForkBgSave, 0, NULL, NULL), + createBoolConfig("use-fork", NULL, IMMUTABLE_CONFIG, cserver.fForkBgSave, 1, NULL, NULL), createBoolConfig("io-threads-do-reads", NULL, IMMUTABLE_CONFIG, fDummy, 0, NULL, NULL), createBoolConfig("time-thread-priority", NULL, IMMUTABLE_CONFIG, cserver.time_thread_priority, 0, NULL, NULL), createBoolConfig("prefetch-enabled", NULL, MODIFIABLE_CONFIG, g_pserver->prefetch_enabled, 1,
NULL, NULL), @@ -2795,6 +2853,7 @@ standardConfig configs[] = { createStringConfig("cluster-announce-ip", NULL, MODIFIABLE_CONFIG, EMPTY_STRING_IS_NULL, g_pserver->cluster_announce_ip, NULL, NULL, NULL), createStringConfig("syslog-ident", NULL, IMMUTABLE_CONFIG, ALLOW_EMPTY_STRING, g_pserver->syslog_ident, "redis", NULL, NULL), createStringConfig("dbfilename", NULL, MODIFIABLE_CONFIG, ALLOW_EMPTY_STRING, g_pserver->rdb_filename, CONFIG_DEFAULT_RDB_FILENAME, isValidDBfilename, NULL), + createStringConfig("db-s3-object", NULL, MODIFIABLE_CONFIG, EMPTY_STRING_IS_NULL, g_pserver->rdb_s3bucketpath, NULL, isValidS3Bucket, NULL), createStringConfig("appendfilename", NULL, IMMUTABLE_CONFIG, ALLOW_EMPTY_STRING, g_pserver->aof_filename, "appendonly.aof", isValidAOFfilename, NULL), createStringConfig("server_cpulist", NULL, IMMUTABLE_CONFIG, EMPTY_STRING_IS_NULL, g_pserver->server_cpulist, NULL, NULL, NULL), createStringConfig("bio_cpulist", NULL, IMMUTABLE_CONFIG, EMPTY_STRING_IS_NULL, g_pserver->bio_cpulist, NULL, NULL, NULL), @@ -2860,6 +2919,7 @@ standardConfig configs[] = { /* Unsigned int configs */ createUIntConfig("maxclients", NULL, MODIFIABLE_CONFIG, 1, UINT_MAX, g_pserver->maxclients, 10000, INTEGER_CONFIG, NULL, updateMaxclients), createUIntConfig("loading-process-events-interval-keys", NULL, MODIFIABLE_CONFIG, 0, LONG_MAX, g_pserver->loading_process_events_interval_keys, 8192, MEMORY_CONFIG, NULL, NULL), + createUIntConfig("maxclients-reserved", NULL, MODIFIABLE_CONFIG, 0, 100, g_pserver->maxclientsReserved, 0, INTEGER_CONFIG, NULL, NULL), /* Unsigned Long configs */ createULongConfig("active-defrag-max-scan-fields", NULL, MODIFIABLE_CONFIG, 1, LONG_MAX, cserver.active_defrag_max_scan_fields, 1000, INTEGER_CONFIG, NULL, NULL), /* Default: keys with more than 1000 fields will be processed separately */ @@ -2873,9 +2933,10 @@ standardConfig configs[] = { createLongLongConfig("latency-monitor-threshold", NULL, MODIFIABLE_CONFIG, 0, LLONG_MAX, g_pserver->latency_monitor_threshold, 0, INTEGER_CONFIG, NULL, NULL), createLongLongConfig("proto-max-bulk-len", NULL, MODIFIABLE_CONFIG, 1024*1024, LLONG_MAX, g_pserver->proto_max_bulk_len, 512ll*1024*1024, MEMORY_CONFIG, NULL, NULL), /* Bulk request max size */ createLongLongConfig("stream-node-max-entries", NULL, MODIFIABLE_CONFIG, 0, LLONG_MAX, g_pserver->stream_node_max_entries, 100, INTEGER_CONFIG, NULL, NULL), - createLongLongConfig("repl-backlog-size", NULL, MODIFIABLE_CONFIG, 1, LLONG_MAX, g_pserver->repl_backlog_size, 1024*1024, MEMORY_CONFIG, NULL, updateReplBacklogSize), /* Default: 1mb */ + createLongLongConfig("repl-backlog-size", NULL, MODIFIABLE_CONFIG, 1, LLONG_MAX, g_pserver->repl_backlog_config_size, 1024*1024, MEMORY_CONFIG, NULL, updateReplBacklogSize), /* Default: 1mb */ createLongLongConfig("repl-backlog-disk-reserve", NULL, IMMUTABLE_CONFIG, 0, LLONG_MAX, cserver.repl_backlog_disk_size, 0, MEMORY_CONFIG, NULL, NULL), createLongLongConfig("max-snapshot-slip", NULL, MODIFIABLE_CONFIG, 0, 5000, g_pserver->snapshot_slip, 400, 0, NULL, NULL), + createLongLongConfig("max-rand-count", NULL, MODIFIABLE_CONFIG, 0, LONG_MAX/2, g_pserver->rand_total_threshold, LONG_MAX/2, 0, NULL, NULL), /* Unsigned Long Long configs */ createULongLongConfig("maxmemory", NULL, MODIFIABLE_CONFIG, 0, LLONG_MAX, g_pserver->maxmemory, 0, MEMORY_CONFIG, NULL, updateMaxmemory), @@ -2902,6 +2963,12 @@ standardConfig configs[] = { createBoolConfig("multi-master-no-forward", NULL, MODIFIABLE_CONFIG, cserver.multimaster_no_forward, 0, validateMultiMasterNoForward, 
NULL), createBoolConfig("allow-write-during-load", NULL, MODIFIABLE_CONFIG, g_pserver->fWriteDuringActiveLoad, 0, NULL, NULL), createBoolConfig("force-backlog-disk-reserve", NULL, MODIFIABLE_CONFIG, cserver.force_backlog_disk, 0, NULL, NULL), + createBoolConfig("soft-shutdown", NULL, MODIFIABLE_CONFIG, g_pserver->config_soft_shutdown, 0, NULL, NULL), + createBoolConfig("flash-disable-key-cache", NULL, MODIFIABLE_CONFIG, g_pserver->flash_disable_key_cache, 0, NULL, NULL), + createSizeTConfig("semi-ordered-set-bucket-size", NULL, MODIFIABLE_CONFIG, 0, 1024, g_semiOrderedSetTargetBucketSize, 0, INTEGER_CONFIG, NULL, NULL), + createSDSConfig("availability-zone", NULL, MODIFIABLE_CONFIG, 0, g_pserver->sdsAvailabilityZone, "", NULL, NULL), + createIntConfig("overload-protect-percent", NULL, MODIFIABLE_CONFIG, 0, 200, g_pserver->overload_protect_threshold, 0, INTEGER_CONFIG, NULL, NULL), + createIntConfig("force-eviction-percent", NULL, MODIFIABLE_CONFIG, 0, 100, g_pserver->force_eviction_percent, 0, INTEGER_CONFIG, NULL, NULL), #ifdef USE_OPENSSL createIntConfig("tls-port", NULL, MODIFIABLE_CONFIG, 0, 65535, g_pserver->tls_port, 0, INTEGER_CONFIG, NULL, updateTLSPort), /* TCP port. */ diff --git a/src/connection.cpp b/src/connection.cpp index 2c34596bb..d2ca0ceea 100644 --- a/src/connection.cpp +++ b/src/connection.cpp @@ -161,6 +161,11 @@ static void connSocketClose(connection *conn) { return; } + if (conn->fprint) { + zfree(conn->fprint); + conn->fprint = NULL; + } + zfree(conn); } diff --git a/src/connection.h b/src/connection.h index bffde3f5c..0b0b6603a 100644 --- a/src/connection.h +++ b/src/connection.h @@ -51,6 +51,7 @@ typedef enum { #define CONN_FLAG_WRITE_BARRIER (1<<1) /* Write barrier requested */ #define CONN_FLAG_READ_THREADSAFE (1<<2) #define CONN_FLAG_WRITE_THREADSAFE (1<<3) +#define CONN_FLAG_AUDIT_LOGGING_REQUIRED (1<<4) #define CONN_TYPE_SOCKET 1 #define CONN_TYPE_TLS 2 @@ -86,6 +87,7 @@ struct connection { ConnectionCallbackFunc write_handler; ConnectionCallbackFunc read_handler; int fd; + char* fprint; }; /* The connection module does not deal with listening and accepting sockets, diff --git a/src/db.cpp b/src/db.cpp index d9e37816d..297910b9e 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -93,7 +93,7 @@ static void lookupKeyUpdateObj(robj *val, int flags) /* Update the access time for the ageing algorithm. * Don't do it if we have a saving child, as this will trigger * a copy on write madness. 
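 * (Added note, grounded in this diff's change below: with KeyDB's optional
 * threaded BGSAVE there may be no child process at all while a save runs,
 * which is why the check was widened from hasActiveChildProcess() to
 * hasActiveChildProcessOrBGSave().)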
*/ - if (!hasActiveChildProcess() && !(flags & LOOKUP_NOTOUCH)) + if (!hasActiveChildProcessOrBGSave() && !(flags & LOOKUP_NOTOUCH)) { if (g_pserver->maxmemory_policy & MAXMEMORY_FLAG_LFU) { updateLFU(val); @@ -1059,7 +1059,7 @@ void keysCommand(client *c) { sds pattern = szFromObj(c->argv[1]); const redisDbPersistentDataSnapshot *snapshot = nullptr; - if (!(c->flags & (CLIENT_MULTI | CLIENT_BLOCKED))) + if (!(c->flags & (CLIENT_MULTI | CLIENT_BLOCKED | CLIENT_DENY_BLOCKING)) && !(serverTL->in_eval || serverTL->in_exec)) snapshot = c->db->createSnapshot(c->mvccCheckpoint, true /* fOptional */); if (snapshot != nullptr) { @@ -1224,7 +1224,7 @@ void scanGenericCommand(client *c, robj_roptr o, unsigned long cursor) { } } - if (o == nullptr && count >= 100) + if (o == nullptr && count >= 100 && !(serverTL->in_eval || serverTL->in_exec)) { // Do an async version if (c->asyncCommand( @@ -1462,6 +1462,11 @@ void shutdownCommand(client *c) { flags |= SHUTDOWN_NOSAVE; } else if (!strcasecmp(szFromObj(c->argv[1]),"save")) { flags |= SHUTDOWN_SAVE; + } else if (!strcasecmp(szFromObj(c->argv[1]), "soft")) { + g_pserver->soft_shutdown = true; + serverLog(LL_WARNING, "Soft Shutdown Initiated"); + addReply(c, shared.ok); + return; } else { addReplyErrorObject(c,shared.syntaxerr); return; } @@ -1568,6 +1573,12 @@ void moveCommand(client *c) { return; } + /* Return zero if the key already exists in the target DB */ + if (lookupKeyWrite(dst,c->argv[1]) != NULL) { + addReply(c,shared.czero); + return; + } + std::unique_ptr<expireEntry> spexpire; { // scope pexpireOld std::unique_lock ul(g_expireLock); @@ -1582,12 +1593,6 @@ void moveCommand(client *c) { dbDelete(src,c->argv[1]); g_pserver->dirty++; - /* Return zero if the key already exists in the target DB */ - if (lookupKeyWrite(dst,c->argv[1]) != NULL) { - addReply(c,shared.czero); - decrRefCount(o); - return; - } dbAdd(dst,c->argv[1],o); if (spexpire != nullptr) setExpire(c,dst,c->argv[1],std::move(*spexpire)); @@ -1732,6 +1737,9 @@ int dbSwapDatabases(int id1, int id2) { id2 < 0 || id2 >= cserver.dbnum) return C_ERR; if (id1 == id2) return C_OK; std::swap(g_pserver->db[id1], g_pserver->db[id2]); + + // swap the db ids too, otherwise each db no longer matches its index + std::swap(g_pserver->db[id1]->id, g_pserver->db[id2]->id); /* Note that we don't swap blocking_keys, * ready_keys and watched_keys, since we want clients to @@ -1761,7 +1769,7 @@ /* SWAPDB db1 db2 */ void swapdbCommand(client *c) { - int id1, id2; + int id1, id2, oriIdx = 0; /* Not allowed in cluster mode: we have just DB 0 there. */ if (g_pserver->cluster_enabled) { @@ -1778,6 +1786,14 @@ void swapdbCommand(client *c) { "invalid second DB index") != C_OK) return; + // get the index of the client's current db + for (int idb=0; idb < cserver.dbnum; ++idb) { + if (g_pserver->db[idb]->id == c->db->id) { + oriIdx = idb; + break; + } + } + /* Swap... */ if (dbSwapDatabases(id1,id2) == C_ERR) { addReplyError(c,"DB index is out of range"); @@ -1786,6 +1802,18 @@ RedisModuleSwapDbInfo si = {REDISMODULE_SWAPDBINFO_VERSION,(int32_t)id1,(int32_t)id2}; moduleFireServerEvent(REDISMODULE_EVENT_SWAPDB,0,&si); g_pserver->dirty++; + + // set client's db to original db + c->db=g_pserver->db[oriIdx]; + + // Persist the database index to dbid mapping into FLASH for later recovery.
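+ // (Illustrative sketch, not part of the change itself: with the record layout
+ // used below, SWAPDB 0 1 leaves the metadata db holding "db-0" -> id of the
+ // database now in slot 0 and "db-1" -> id of the database now in slot 1, so a
+ // restart against the same FLASH storage can re-associate each on-disk
+ // database with its swapped index by reading the "db-<index>" keys back.)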
+ if (g_pserver->m_pstorageFactory != nullptr && g_pserver->metadataDb != nullptr) { + std::string dbid_key = "db-" + std::to_string(id1); + g_pserver->metadataDb->insert(dbid_key.c_str(), dbid_key.length(), &g_pserver->db[id1]->id, sizeof(g_pserver->db[id1]->id), true); + + dbid_key = "db-" + std::to_string(id2); + g_pserver->metadataDb->insert(dbid_key.c_str(), dbid_key.length(), &g_pserver->db[id2]->id, sizeof(g_pserver->db[id2]->id), true); + } addReply(c,shared.ok); } } @@ -1926,6 +1954,7 @@ void setExpire(client *c, redisDb *db, robj *key, expireEntry &&e) * is associated with this key (i.e. the key is non volatile) */ expireEntry *redisDbPersistentDataSnapshot::getExpire(const char *key) { /* No expire? return ASAP */ + std::unique_lock ul(g_expireLock); if (expireSize() == 0) return nullptr; @@ -2474,25 +2503,28 @@ void slotToKeyUpdateKeyCore(const char *key, size_t keylen, int add) { serverAssert(GlobalLocksAcquired()); unsigned int hashslot = keyHashSlot(key,keylen); - unsigned char buf[64]; - unsigned char *indexed = buf; - g_pserver->cluster->slots_keys_count[hashslot] += add ? 1 : -1; - if (keylen+2 > 64) indexed = (unsigned char*)zmalloc(keylen+2, MALLOC_SHARED); - indexed[0] = (hashslot >> 8) & 0xff; - indexed[1] = hashslot & 0xff; - memcpy(indexed+2,key,keylen); - int fModified = false; - if (add) { - fModified = raxInsert(g_pserver->cluster->slots_to_keys,indexed,keylen+2,NULL,NULL); - } else { - fModified = raxRemove(g_pserver->cluster->slots_to_keys,indexed,keylen+2,NULL); + + if (g_pserver->m_pstorageFactory == nullptr) { + unsigned char buf[64]; + unsigned char *indexed = buf; + + if (keylen+2 > 64) indexed = (unsigned char*)zmalloc(keylen+2, MALLOC_SHARED); + indexed[0] = (hashslot >> 8) & 0xff; + indexed[1] = hashslot & 0xff; + memcpy(indexed+2,key,keylen); + int fModified = false; + if (add) { + fModified = raxInsert(g_pserver->cluster->slots_to_keys,indexed,keylen+2,NULL,NULL); + } else { + fModified = raxRemove(g_pserver->cluster->slots_to_keys,indexed,keylen+2,NULL); + } + // This assert is disabled when a snapshot depth is >0 because prepOverwriteForSnapshot will add in a tombstone, + // this prevents ensure from adding the key to the dictionary which means the caller isn't aware we're already tracking + // the key. + serverAssert(fModified || g_pserver->db[0]->snapshot_depth() > 0); + if (indexed != buf) zfree(indexed); } - // This assert is disabled when a snapshot depth is >0 because prepOverwriteForSnapshot will add in a tombstone, - // this prevents ensure from adding the key to the dictionary which means the caller isn't aware we're already tracking - // the key. - serverAssert(fModified || g_pserver->db[0]->snapshot_depth() > 0); - if (indexed != buf) zfree(indexed); } void slotToKeyAdd(sds key) { @@ -2528,47 +2560,67 @@ void slotToKeyFlush(int async) { * New objects are returned to represent keys, it's up to the caller to * decrement the reference count to release the keys names. 
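 * (Note on the two lookup paths that follow: when a storage provider is
 * configured the slot index lives in the provider itself, so the storage
 * branch enumerates the hashslot directly instead of walking the in-memory
 * slots_to_keys radix tree.)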
*/ unsigned int getKeysInSlot(unsigned int hashslot, robj **keys, unsigned int count) { - raxIterator iter; - int j = 0; - unsigned char indexed[2]; + if (g_pserver->m_pstorageFactory != nullptr) { + int j = 0; + g_pserver->db[0]->getStorageCache()->enumerate_hashslot([&](const char *key, size_t cchKey, const void *, size_t )->bool { + keys[j++] = createStringObject(key, cchKey); + return --count; + }, hashslot); + return j; + } else { + raxIterator iter; + int j = 0; + unsigned char indexed[2]; - indexed[0] = (hashslot >> 8) & 0xff; - indexed[1] = hashslot & 0xff; - raxStart(&iter,g_pserver->cluster->slots_to_keys); - raxSeek(&iter,">=",indexed,2); - while(count-- && raxNext(&iter)) { - if (iter.key[0] != indexed[0] || iter.key[1] != indexed[1]) break; - keys[j++] = createStringObject((char*)iter.key+2,iter.key_len-2); + indexed[0] = (hashslot >> 8) & 0xff; + indexed[1] = hashslot & 0xff; + raxStart(&iter,g_pserver->cluster->slots_to_keys); + raxSeek(&iter,">=",indexed,2); + while(count-- && raxNext(&iter)) { + if (iter.key[0] != indexed[0] || iter.key[1] != indexed[1]) break; + keys[j++] = createStringObject((char*)iter.key+2,iter.key_len-2); + } + raxStop(&iter); + return j; } - raxStop(&iter); - return j; } /* Remove all the keys in the specified hash slot. * The number of removed items is returned. */ unsigned int delKeysInSlot(unsigned int hashslot) { serverAssert(GlobalLocksAcquired()); - - raxIterator iter; - int j = 0; - unsigned char indexed[2]; - - indexed[0] = (hashslot >> 8) & 0xff; - indexed[1] = hashslot & 0xff; - raxStart(&iter,g_pserver->cluster->slots_to_keys); - while(g_pserver->cluster->slots_keys_count[hashslot]) { - raxSeek(&iter,">=",indexed,2); - raxNext(&iter); - - auto count = g_pserver->cluster->slots_keys_count[hashslot]; - robj *key = createStringObject((char*)iter.key+2,iter.key_len-2); - dbDelete(g_pserver->db[0],key); - serverAssert(count > g_pserver->cluster->slots_keys_count[hashslot]); // we should have deleted something or we will be in an infinite loop - decrRefCount(key); - j++; + if (g_pserver->m_pstorageFactory != nullptr) { + int j = 0; + g_pserver->db[0]->getStorageCache()->enumerate_hashslot([&](const char *key, size_t cchKey, const void *, size_t )->bool { + robj *keyobj = createStringObject(key, cchKey); + dbDelete(g_pserver->db[0], keyobj); + decrRefCount(keyobj); + j++; + return true; + }, hashslot); + return j; + } else { + raxIterator iter; + int j = 0; + unsigned char indexed[2]; + + indexed[0] = (hashslot >> 8) & 0xff; + indexed[1] = hashslot & 0xff; + raxStart(&iter,g_pserver->cluster->slots_to_keys); + while(g_pserver->cluster->slots_keys_count[hashslot]) { + raxSeek(&iter,">=",indexed,2); + raxNext(&iter); + + auto count = g_pserver->cluster->slots_keys_count[hashslot]; + robj *key = createStringObject((char*)iter.key+2,iter.key_len-2); + dbDelete(g_pserver->db[0],key); + serverAssert(count > g_pserver->cluster->slots_keys_count[hashslot]); // we should have deleted something or we will be in an infinite loop + decrRefCount(key); + j++; + } + raxStop(&iter); + return j; } - raxStop(&iter); - return j; } unsigned int countKeysInSlot(unsigned int hashslot) { @@ -2602,6 +2654,17 @@ void clusterStorageLoadCallback(const char *rgchkey, size_t cch, void *) slotToKeyUpdateKeyCore(rgchkey, cch, true /*add*/); } +void moduleLoadCallback(const char * rgchKey, size_t, void *data) { + redisObjectStack keyobj; + initStaticStringObject(keyobj, const_cast<char*>(rgchKey)); + moduleNotifyKeyspaceEvent(NOTIFY_LOADED, "loaded", &keyobj, *(int *)data); +} +
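+/* Sketch of how the two load callbacks compose: in cluster mode a key loaded
+ * from the storage provider must both update the slot-to-key index and fire
+ * the module NOTIFY_LOADED keyspace event, so the cluster variant below simply
+ * chains the two handlers. */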
+void moduleClusterLoadCallback(const char * rgchKey, size_t cchKey, void *data) { + clusterStorageLoadCallback(rgchKey, cchKey, data); + moduleLoadCallback(rgchKey, cchKey, data); +} + void redisDb::initialize(int id) { redisDbPersistentData::initialize(); @@ -2620,8 +2683,8 @@ void redisDb::storageProviderInitialize() { if (g_pserver->m_pstorageFactory != nullptr) { - IStorageFactory::key_load_iterator itr = (g_pserver->cluster_enabled) ? clusterStorageLoadCallback : nullptr; - this->setStorageProvider(StorageCache::create(g_pserver->m_pstorageFactory, id, itr, nullptr)); + IStorageFactory::key_load_iterator itr = g_pserver->cluster_enabled ? moduleClusterLoadCallback : moduleLoadCallback; + this->setStorageProvider(StorageCache::create(g_pserver->m_pstorageFactory, id, itr, &id)); } } @@ -2639,7 +2702,10 @@ bool redisDbPersistentData::insert(char *key, robj *o, bool fAssumeNew, dict_ite ensure(key); dictEntry *de; int res = dictAdd(m_pdict, key, o, &de); - serverAssert(FImplies(fAssumeNew, res == DICT_OK)); + if (!FImplies(fAssumeNew, res == DICT_OK)) { + serverLog(LL_WARNING, + "Assumed-new key %s already existed in DB.", key); + } if (res == DICT_OK) { #ifdef CHECKED_BUILD @@ -2708,8 +2774,11 @@ void redisDbPersistentData::clear(void(callback)(void*)) m_cnewKeysPending = 0; m_fAllChanged++; } + { + std::unique_lock ul(g_expireLock); delete m_setexpire; m_setexpire = new (MALLOC_LOCAL) expireset(); + } if (m_spstorage != nullptr) m_spstorage->clear(callback); dictEmpty(m_pdictTombstone,callback); @@ -2842,7 +2911,8 @@ void redisDbPersistentData::ensure(const char *sdsKey, dictEntry **pde) { dictAdd(m_pdict, sdsNewKey, o); o->SetFExpires(spexpire != nullptr); - + + std::unique_lock ul(g_expireLock); if (spexpire != nullptr) { auto itr = m_setexpire->find(sdsKey); @@ -2865,6 +2935,7 @@ void redisDbPersistentData::ensure(const char *sdsKey, dictEntry **pde) if (*pde != nullptr && dictGetVal(*pde) != nullptr) { robj *o = (robj*)dictGetVal(*pde); + std::unique_lock ul(g_expireLock); serverAssert(o->FExpires() == (m_setexpire->find(sdsKey) != m_setexpire->end())); } } @@ -2986,8 +3057,16 @@ void redisDbPersistentData::processChangesAsync(std::atomic<int> &pendingJobs) }); } -void redisDbPersistentData::bulkStorageInsert(char **rgKeys, size_t *rgcbKeys, char **rgVals, size_t *rgcbVals, size_t celem) +/* Bulk insert directly into the storage provider, bypassing the in-memory dict. Assumes rgKeys and rgVals are not sds strings. */ +void redisDbPersistentData::bulkDirectStorageInsert(char **rgKeys, size_t *rgcbKeys, char **rgVals, size_t *rgcbVals, size_t celem) { + if (g_pserver->cluster_enabled) { + aeAcquireLock(); + for (size_t i = 0; i < celem; i++) { + slotToKeyUpdateKeyCore(rgKeys[i], rgcbKeys[i], 1); + } + aeReleaseLock(); + } m_spstorage->bulkInsert(rgKeys, rgcbKeys, rgVals, rgcbVals, celem); } @@ -3229,6 +3308,7 @@ std::unique_ptr<expireEntry> deserializeExpire(sds key, const char *str, size_t sds serializeStoredObjectAndExpire(redisDbPersistentData *db, const char *key, robj_roptr o) { + std::unique_lock ul(g_expireLock); auto itrExpire = db->setexpire()->find(key); const expireEntry *pexpire = nullptr; if (itrExpire != db->setexpire()->end()) @@ -3249,7 +3329,7 @@ int dbnumFromDb(redisDb *db) serverPanic("invalid database pointer"); } -bool redisDbPersistentData::prefetchKeysAsync(client *c, parsed_command &command, bool fExecOK) +void redisDbPersistentData::prefetchKeysAsync(client *c, parsed_command &command) { if (m_spstorage == nullptr) { #if defined(__x86_64__) || defined(__i386__) @@ -3261,7 -3341,7
@@ bool redisDbPersistentData::prefetchKeysAsync(client *c, parsed_command &command const char *cmd = szFromObj(command.argv[0]); if (!strcasecmp(cmd, "set") || !strcasecmp(cmd, "get")) { if (c->db->m_spdbSnapshotHOLDER != nullptr) - return false; // this is dangerous enough without a snapshot around + return; // this is dangerous enough without a snapshot around auto h = dictSdsHash(szFromObj(command.argv[1])); for (int iht = 0; iht < 2; ++iht) { auto hT = h & c->db->m_pdict->ht[iht].sizemask; @@ -3281,7 +3361,7 @@ bool redisDbPersistentData::prefetchKeysAsync(client *c, parsed_command &command } } #endif - return false; + return; } AeLocker lock; @@ -3291,7 +3371,11 @@ bool redisDbPersistentData::prefetchKeysAsync(client *c, parsed_command &command getKeysResult result = GETKEYS_RESULT_INIT; auto cmd = lookupCommand(szFromObj(command.argv[0])); if (cmd == nullptr) - return false; // Bad command? It's not for us to judge, just bail + return; // Bad command? It's not for us to judge, just bail + + if (command.argc < std::abs(cmd->arity)) + return; // Invalid number of args + int numkeys = getKeysFromCommand(cmd, command.argv, command.argc, &result); for (int ikey = 0; ikey < numkeys; ++ikey) { @@ -3323,7 +3407,6 @@ bool redisDbPersistentData::prefetchKeysAsync(client *c, parsed_command &command } } - bool fNoInsert = false; if (!vecInserts.empty()) { lock.arm(c); for (auto &tuple : vecInserts) @@ -3339,19 +3422,18 @@ bool redisDbPersistentData::prefetchKeysAsync(client *c, parsed_command &command // While unlocked this was already ensured decrRefCount(o); sdsfree(sharedKey); - fNoInsert = true; } else { if (spexpire != nullptr) { if (spexpire->when() < mstime()) { - fNoInsert = true; break; } } dictAdd(m_pdict, sharedKey, o); o->SetFExpires(spexpire != nullptr); + std::unique_lock ul(g_expireLock); if (spexpire != nullptr) { auto itr = m_setexpire->find(sharedKey); @@ -3372,12 +3454,5 @@ bool redisDbPersistentData::prefetchKeysAsync(client *c, parsed_command &command lock.disarm(); } - if (fExecOK && !fNoInsert && cmd->proc == getCommand && !vecInserts.empty()) { - robj *o = std::get<1>(vecInserts[0]); - if (o != nullptr) { - addReplyBulk(c, o); - return true; - } - } - return false; -} \ No newline at end of file + return; +} diff --git a/src/debug.cpp b/src/debug.cpp index b709937c9..688ca3a0e 100644 --- a/src/debug.cpp +++ b/src/debug.cpp @@ -341,10 +341,10 @@ void mallctl_int(client *c, robj **argv, int argc) { } size_t sz = sizeof(old); while (sz > 0) { - if ((ret=je_mallctl(szFromObj(argv[0]), &old, &sz, argc > 1? &val: NULL, argc > 1?sz: 0))) { + if ((ret=mallctl(szFromObj(argv[0]), &old, &sz, argc > 1? &val: NULL, argc > 1?sz: 0))) { if (ret == EPERM && argc > 1) { /* if this option is write only, try just writing to it. */ - if (!(ret=je_mallctl(szFromObj(argv[0]), NULL, 0, &val, sz))) { + if (!(ret=mallctl(szFromObj(argv[0]), NULL, 0, &val, sz))) { addReply(c, shared.ok); return; } @@ -375,7 +375,7 @@ void mallctl_string(client *c, robj **argv, int argc) { char *old; size_t sz = sizeof(old); /* for strings, it seems we need to first get the old value, before overriding it. */ - if ((rret=je_mallctl(szFromObj(argv[0]), &old, &sz, NULL, 0))) { + if ((rret=mallctl(szFromObj(argv[0]), &old, &sz, NULL, 0))) { /* return error unless this option is write only. 
*/ if (!(rret == EPERM && argc > 1)) { addReplyErrorFormat(c,"%s", strerror(rret)); @@ -387,7 +387,7 @@ void mallctl_string(client *c, robj **argv, int argc) { char **valref = &val; if ((!strcmp(val,"VOID"))) valref = NULL, sz = 0; - wret = je_mallctl(szFromObj(argv[0]), NULL, 0, valref, sz); + wret = mallctl(szFromObj(argv[0]), NULL, 0, valref, sz); } if (!rret) addReplyBulkCString(c, old); diff --git a/src/dict.cpp b/src/dict.cpp index b29c0e24b..51234df87 100644 --- a/src/dict.cpp +++ b/src/dict.cpp @@ -1577,7 +1577,7 @@ char *stringFromLongLong(long long value) { int len; char *s; - len = sprintf(buf,"%lld",value); + len = snprintf(buf,sizeof(buf),"%lld",value); s = zmalloc(len+1); memcpy(s, buf, len); s[len] = '\0'; diff --git a/src/endianconv.c b/src/endianconv.c index 98ed405a5..76ae347fa 100644 --- a/src/endianconv.c +++ b/src/endianconv.c @@ -112,15 +112,15 @@ int endianconvTest(int argc, char *argv[], int accurate) { UNUSED(argv); UNUSED(accurate); - sprintf(buf,"ciaoroma"); + snprintf(buf,sizeof(buf),"ciaoroma"); memrev16(buf); printf("%s\n", buf); - sprintf(buf,"ciaoroma"); + snprintf(buf,sizeof(buf),"ciaoroma"); memrev32(buf); printf("%s\n", buf); - sprintf(buf,"ciaoroma"); + snprintf(buf,sizeof(buf),"ciaoroma"); memrev64(buf); printf("%s\n", buf); diff --git a/src/evict.cpp b/src/evict.cpp index 719e7a761..1523b2814 100644 --- a/src/evict.cpp +++ b/src/evict.cpp @@ -380,7 +380,7 @@ size_t freeMemoryGetNotCountedMemory(void) { /* also don't count the replication backlog memory * that's where the replication clients get their memory from */ - overhead += g_pserver->repl_backlog_size; + overhead += g_pserver->repl_backlog_size - g_pserver->repl_backlog_config_size; if (g_pserver->aof_state != AOF_OFF) { overhead += sdsalloc(g_pserver->aof_buf)+aofRewriteBufferSize(); @@ -411,8 +411,13 @@ size_t freeMemoryGetNotCountedMemory(void) { * memory currently used. May be > 1 if we are over the memory * limit. * (Populated both for C_ERR and C_OK) + * + * 'reason' the reason why the memory limit was exceeded + * EVICT_REASON_USER: reported user memory exceeded maxmemory + * EVICT_REASON_SYS: available system memory under configurable threshold + * (Populated when C_ERR is returned) */ -int getMaxmemoryState(size_t *total, size_t *logical, size_t *tofree, float *level, bool fQuickCycle, bool fPreSnapshot) { +int getMaxmemoryState(size_t *total, size_t *logical, size_t *tofree, float *level, EvictReason *reason, bool fQuickCycle, bool fPreSnapshot) { size_t mem_reported, mem_used, mem_tofree; /* Check if we are over the memory usage limit. 
If we are not, no need @@ -421,10 +426,22 @@ int getMaxmemoryState(size_t *total, size_t *logical, size_t *tofree, float *lev if (total) *total = mem_reported; size_t maxmemory = g_pserver->maxmemory; if (fPreSnapshot) - maxmemory = static_cast<size_t>(maxmemory * 0.9); // derate memory by 10% since we won't be able to free during snapshot - if (g_pserver->FRdbSaveInProgress()) + maxmemory = static_cast<size_t>(maxmemory*0.9); // derate memory by 10% since we won't be able to free during snapshot + if (g_pserver->FRdbSaveInProgress() && !cserver.fForkBgSave) maxmemory = static_cast<size_t>(maxmemory*1.2); + /* If available system memory is below a certain threshold, force eviction */ + long long sys_available_mem_buffer = 0; + if (g_pserver->force_eviction_percent && g_pserver->cron_malloc_stats.sys_total) { + float available_mem_ratio = (float)(100 - g_pserver->force_eviction_percent)/100; + size_t min_available_mem = static_cast<size_t>(g_pserver->cron_malloc_stats.sys_total * available_mem_ratio); + sys_available_mem_buffer = static_cast<long long>(g_pserver->cron_malloc_stats.sys_available - min_available_mem); + if (sys_available_mem_buffer < 0) { + long long mem_threshold = mem_reported + sys_available_mem_buffer; + maxmemory = ((long long)maxmemory < mem_threshold) ? maxmemory : static_cast<size_t>(mem_threshold); + } + } + /* We may return ASAP if there is no need to compute the level. */ int return_ok_asap = !maxmemory || mem_reported <= maxmemory; if (return_ok_asap && !level) return C_OK; @@ -435,6 +452,12 @@ int getMaxmemoryState(size_t *total, size_t *logical, size_t *tofree, float *lev size_t overhead = freeMemoryGetNotCountedMemory(); mem_used = (mem_used > overhead) ? mem_used-overhead : 0; + /* If system available memory is too low, we want to force evictions no matter + * what so we also offset the overhead from maxmemory. */ + if (sys_available_mem_buffer < 0) { + maxmemory = (maxmemory > overhead) ? maxmemory-overhead : 0; + } + /* Compute the ratio of memory usage. */ if (level) { if (!maxmemory) { @@ -459,6 +482,8 @@ int getMaxmemoryState(size_t *total, size_t *logical, size_t *tofree, float *lev if (logical) *logical = mem_used; if (tofree) *tofree = mem_tofree; + if (reason) *reason = sys_available_mem_buffer < 0 ? EvictReason::System : EvictReason::User; + return C_ERR; } @@ -483,6 +508,7 @@ class FreeMemoryLazyFree : public ICollectable for (auto de : pair.second) { dictFreeUnlinkedEntry(pair.first, de); } + dictRelease(pair.first); } aeReleaseLock(); --s_clazyFreesInProgress; @@ -513,6 +539,7 @@ class FreeMemoryLazyFree : public ICollectable ); if (itr == vecdictvecde.end() || itr->first != d) { itr = vecdictvecde.insert(itr, std::make_pair(d, std::vector<dictEntry*>())); + __atomic_fetch_add(&d->refcount, 1, __ATOMIC_ACQ_REL); } serverAssert(itr->first == d); itr->second.push_back(de); @@ -552,7 +579,7 @@ static int evictionTimeProc( UNUSED(clientData); serverAssert(GlobalLocksAcquired()); - if (performEvictions(false) == EVICT_RUNNING) return 0; /* keep evicting */ + if (performEvictions((bool)clientData) == EVICT_RUNNING) return 0; /* keep evicting */ /* For EVICT_OK - things are good, no need to keep evicting. * For EVICT_FAIL - there is nothing left to evict. */ @@ -602,6 +629,12 @@ static unsigned long evictionTimeLimitUs() { return ULONG_MAX; /* No limit to eviction time */ } +static void updateSysAvailableMemory() { + if (g_pserver->force_eviction_percent) { + g_pserver->cron_malloc_stats.sys_available = getMemAvailable(); + } +} + /* Check that memory usage is within the current "maxmemory" limit.
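 * (Worked example for the system-memory path added above, assuming MemTotal
 * and MemAvailable come from /proc/meminfo as in the new meminfo.cpp: with
 * force-eviction-percent 90 and MemTotal 100 GiB, min_available_mem is
 * 10 GiB, so evictions are forced once MemAvailable drops below that reserve.)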
If over * "maxmemory", attempt to free memory by evicting data (if it's safe to do so). * @@ -639,10 +672,11 @@ int performEvictions(bool fPreSnapshot) { const bool fEvictToStorage = !cserver.delete_on_evict && g_pserver->db[0]->FStorageProvider(); int result = EVICT_FAIL; int ckeysFailed = 0; + EvictReason evictReason; std::unique_ptr splazy = std::make_unique(); - if (getMaxmemoryState(&mem_reported,NULL,&mem_tofree,NULL,false,fPreSnapshot) == C_OK) + if (getMaxmemoryState(&mem_reported,NULL,&mem_tofree,NULL,&evictReason,false,fPreSnapshot) == C_OK) return EVICT_OK; if (g_pserver->maxmemory_policy == MAXMEMORY_NO_EVICTION) @@ -771,6 +805,7 @@ int performEvictions(bool fPreSnapshot) { if (db->removeCachedValue(bestkey, &deT)) { mem_freed += splazy->addEntry(db->dictUnsafeKeyOnly(), deT); ckeysFailed = 0; + g_pserver->stat_evictedkeys++; } else { delta = 0; @@ -824,6 +859,9 @@ int performEvictions(bool fPreSnapshot) { * across the dbAsyncDelete() call, while the thread can * release the memory all the time. */ if (g_pserver->lazyfree_lazy_eviction) { + if (evictReason == EvictReason::System) { + updateSysAvailableMemory(); + } if (getMaxmemoryState(NULL,NULL,NULL,NULL) == C_OK) { break; } @@ -837,7 +875,7 @@ int performEvictions(bool fPreSnapshot) { if (!isEvictionProcRunning && serverTL->el != nullptr) { isEvictionProcRunning = 1; aeCreateTimeEvent(serverTL->el, 0, - evictionTimeProc, NULL, NULL); + evictionTimeProc, (void*)fPreSnapshot, NULL); } break; } @@ -851,9 +889,13 @@ int performEvictions(bool fPreSnapshot) { if (splazy != nullptr && splazy->memory_queued() > 0 && !serverTL->gcEpoch.isReset()) { g_pserver->garbageCollector.enqueue(serverTL->gcEpoch, std::move(splazy)); - } + } cant_free: + if (mem_freed > 0 && evictReason == EvictReason::System) { + updateSysAvailableMemory(); + } + if (g_pserver->m_pstorageFactory) { if (mem_reported < g_pserver->maxmemory*1.2) { diff --git a/src/expire.cpp b/src/expire.cpp index 2f284d513..b727183ad 100644 --- a/src/expire.cpp +++ b/src/expire.cpp @@ -481,6 +481,7 @@ void expireSlaveKeys(void) { if (slaveKeysWithExpire == NULL || dictSize(slaveKeysWithExpire) == 0) return; + std::unique_lock ul(g_expireLock); int cycles = 0, noexpire = 0; mstime_t start = mstime(); while(1) { diff --git a/src/gc.h b/src/gc.h index b60c562a3..eb7a42ff3 100644 --- a/src/gc.h +++ b/src/gc.h @@ -16,7 +16,17 @@ class GarbageCollector struct EpochHolder { uint64_t tstamp; - std::vector> m_vecObjs; + std::unique_ptr>> m_spvecObjs; + + EpochHolder() { + m_spvecObjs = std::make_unique>>(); + } + + // Support move operators + EpochHolder(EpochHolder &&other) = default; + EpochHolder &operator=(EpochHolder &&) = default; + + bool operator<(uint64_t tstamp) const { @@ -108,12 +118,12 @@ class GarbageCollector { EpochHolder e; e.tstamp = m_epochNext+1; - e.m_vecObjs.push_back(std::move(sp)); + e.m_spvecObjs->push_back(std::move(sp)); m_listepochs.emplace_back(std::move(e)); } else { - itr->m_vecObjs.push_back(std::move(sp)); + itr->m_spvecObjs->push_back(std::move(sp)); } } @@ -123,4 +133,4 @@ class GarbageCollector std::list m_listepochs; std::unordered_set m_setepochOutstanding; uint64_t m_epochNext = 0; -}; \ No newline at end of file +}; diff --git a/src/help.h b/src/help.h index c6d7affd6..efb79a063 100644 --- a/src/help.h +++ b/src/help.h @@ -1071,7 +1071,7 @@ struct commandHelp { 1, "2.2.0" }, { "SHUTDOWN", - "[NOSAVE|SAVE]", + "[NOSAVE|SAVE|SOFT]", "Synchronously save the dataset to disk and then shut down the server", 9, "1.0.0" }, diff --git 
a/src/keydb-diagnostic-tool.cpp b/src/keydb-diagnostic-tool.cpp index 74b46ce83..2368043b0 100644 --- a/src/keydb-diagnostic-tool.cpp +++ b/src/keydb-diagnostic-tool.cpp @@ -904,7 +904,7 @@ int main(int argc, const char **argv) { while (self_threads < config.max_threads) { for (int i = 0; i < config.numclients; i++) { - sprintf(command, "SET %d %s\r\n", self_threads * config.numclients + i, set_value); + snprintf(command, sizeof(command), "SET %d %s\r\n", self_threads * config.numclients + i, set_value); createClient(command, strlen(command), NULL,self_threads); } diff --git a/src/meminfo.cpp b/src/meminfo.cpp new file mode 100644 index 000000000..4d2b63e36 --- /dev/null +++ b/src/meminfo.cpp @@ -0,0 +1,32 @@ +#include <string> +#include <fstream> +#include <limits> + +static size_t getMemKey(std::string key) { +# ifdef __linux__ + std::string token; + std::ifstream f("/proc/meminfo"); + while (f >> token) { + if (token == key) { + size_t mem_val; + if (f >> mem_val) { + return mem_val * 1024; // values are in kB + } else { + return 0; + } + } + f.ignore(std::numeric_limits<std::streamsize>::max(), '\n'); // skip the rest of the line + } + return 0; +# else + return 0; +# endif +} + +size_t getMemAvailable() { + return getMemKey("MemAvailable:"); +} + +size_t getMemTotal() { + return getMemKey("MemTotal:"); +} \ No newline at end of file diff --git a/src/module.cpp b/src/module.cpp index cf3c81c1a..eceb80c75 100644 --- a/src/module.cpp +++ b/src/module.cpp @@ -656,7 +656,7 @@ void moduleHandlePropagationAfterCommandCallback(RedisModuleCtx *ctx) { /* We don't need to do anything here if the server isn't inside * a transaction. */ - if (!g_pserver->propagate_in_transaction) return; + if (!serverTL->propagate_in_transaction) return; /* If this command is executed from with Lua or MULTI/EXEC we do not * need to propagate EXEC */ @@ -1814,7 +1814,7 @@ void moduleReplicateMultiIfNeeded(RedisModuleCtx *ctx) { * the module command was called by a script. */ if (serverTL->in_eval || serverTL->in_exec) return; /* If we already emitted MULTI return ASAP. */ - if (g_pserver->propagate_in_transaction) return; + if (serverTL->propagate_in_transaction) return; /* If this is a thread safe context, we do not want to wrap commands * executed into MULTI/EXEC, they are executed as single commands * from an external client in essence. */ @@ -7058,7 +7058,7 @@ RedisModuleString *RM_DictPrev(RedisModuleCtx *ctx, RedisModuleDictIter *di, voi /* Compare the element currently pointed by the iterator to the specified * element given by key/keylen, according to the operator 'op' (the set of * valid operators are the same valid for RedisModule_DictIteratorStart). - * If the comparision is successful the command returns REDISMODULE_OK + * If the comparison is successful the command returns REDISMODULE_OK otherwise REDISMODULE_ERR is returned.
* * This is useful when we want to just emit a lexicographical range, so diff --git a/src/modules/CMakeLists.txt b/src/modules/CMakeLists.txt new file mode 100644 index 000000000..32229dd35 --- /dev/null +++ b/src/modules/CMakeLists.txt @@ -0,0 +1,19 @@ +get_filename_component(base_name "${CMAKE_CURRENT_SOURCE_DIR}" NAME) + +foreach (module + "helloacl" + "helloblock" + "hellocluster" + "hellodict" + "hellohook" + "hellotimer" + "hellotype" + "helloworld") + set(LIBRARY_NAME "${PROJECT_NAME}_${base_name}_${module}") + source_group("${LIBRARY_NAME} Source Files" FILES "${module}.c") + + add_library("${LIBRARY_NAME}" "${module}.c") + + target_link_libraries("${LIBRARY_NAME}" PRIVATE "${PROJECT_NAME}_compiler_flags") + list(APPEND installable_libs "${LIBRARY_NAME}") +endforeach () diff --git a/src/modules/hellotimer.c b/src/modules/hellotimer.c index f6700df26..4afc67967 100644 --- a/src/modules/hellotimer.c +++ b/src/modules/hellotimer.c @@ -51,8 +51,9 @@ int TimerCommand_RedisCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int for (int j = 0; j < 10; j++) { int delay = rand() % 5000; - char *buf = RedisModule_Alloc(256); - snprintf(buf,256,"After %d", delay); + int bufsize = 256; + char *buf = RedisModule_Alloc(bufsize); + snprintf(buf,bufsize,"After %d", delay); RedisModuleTimerID tid = RedisModule_CreateTimer(ctx,delay,timerHandler,buf); REDISMODULE_NOT_USED(tid); } diff --git a/src/multi.cpp b/src/multi.cpp index 207ce9c40..b2f1ccf22 100644 --- a/src/multi.cpp +++ b/src/multi.cpp @@ -118,14 +118,14 @@ void discardCommand(client *c) { void beforePropagateMulti() { /* Propagating MULTI */ - serverAssert(!g_pserver->propagate_in_transaction); - g_pserver->propagate_in_transaction = 1; + serverAssert(!serverTL->propagate_in_transaction); + serverTL->propagate_in_transaction = 1; } void afterPropagateExec() { /* Propagating EXEC */ - serverAssert(g_pserver->propagate_in_transaction == 1); - g_pserver->propagate_in_transaction = 0; + serverAssert(serverTL->propagate_in_transaction == 1); + serverTL->propagate_in_transaction = 0; } /* Send a MULTI command to all the slaves and AOF file. Check the execCommand
*/ - if (g_pserver->propagate_in_transaction) { + if (serverTL->propagate_in_transaction) { int is_master = listLength(g_pserver->masters) == 0; g_pserver->dirty++; /* If inside the MULTI/EXEC block this instance was suddenly diff --git a/src/networking.cpp b/src/networking.cpp index e22a2d8a1..1728b5f29 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -145,7 +145,7 @@ client *createClient(connection *conn, int iel) { client_id = g_pserver->next_client_id.fetch_add(1); c->iel = iel; c->id = client_id; - sprintf(c->lock.szName, "client %" PRIu64, client_id); + snprintf(c->lock.szName, sizeof(c->lock.szName), "client %" PRIu64, client_id); c->resp = 2; c->conn = conn; c->name = NULL; @@ -777,11 +777,11 @@ void setDeferredAggregateLen(client *c, void *node, long length, char prefix) { * we return NULL in addReplyDeferredLen() */ if (node == NULL) return; char lenstr[128]; - size_t lenstr_len = sprintf(lenstr, "%c%ld\r\n", prefix, length); + size_t lenstr_len = snprintf(lenstr, sizeof(lenstr), "%c%ld\r\n", prefix, length); setDeferredReply(c, node, lenstr, lenstr_len); } else { char lenstr[128]; - int lenstr_len = sprintf(lenstr, "%c%ld\r\n", prefix, length); + int lenstr_len = snprintf(lenstr, sizeof(lenstr), "%c%ld\r\n", prefix, length); size_t idxSplice = (size_t)node; serverAssert(idxSplice <= c->replyAsync->used); @@ -1177,6 +1177,11 @@ int chooseBestThreadForAccept() void clientAcceptHandler(connection *conn) { client *c = (client*)connGetPrivateData(conn); + if (conn->flags & CONN_FLAG_AUDIT_LOGGING_REQUIRED) { + c->flags |= CLIENT_AUDIT_LOGGING; + c->fprint = conn->fprint; + } + if (connGetState(conn) != CONN_STATE_CONNECTED) { serverLog(LL_WARNING, "Error accepting a client connection: %s", @@ -1240,6 +1245,7 @@ void clientAcceptHandler(connection *conn) { #define MAX_ACCEPTS_PER_CALL 1000 #define MAX_ACCEPTS_PER_CALL_TLS 100 + static void acceptCommonHandler(connection *conn, int flags, char *ip, int iel) { client *c; char conninfo[100]; @@ -1256,21 +1262,9 @@ static void acceptCommonHandler(connection *conn, int flags, char *ip, int iel) return; } - /* Limit the number of connections we take at the same time. - * - * Admission control will happen before a client is created and connAccept() - * called, because we don't want to even start transport-level negotiation - * if rejected. */ - if (listLength(g_pserver->clients) + getClusterConnectionsCount() - >= g_pserver->maxclients) - { - const char *err; - if (g_pserver->cluster_enabled) - err = "-ERR max number of clients + cluster " - "connections reached\r\n"; - else - err = "-ERR max number of clients reached\r\n"; - + /* Prevent new connections if we're in a soft shutdown situation */ + if (g_pserver->soft_shutdown) { + const char *err = "-SHUTDOWN\r\n"; /* That's a best effort error message, don't check write errors. * Note that for TLS connections, no handshake was done yet so nothing * is written and the connection will just drop. */ @@ -1282,6 +1276,35 @@ static void acceptCommonHandler(connection *conn, int flags, char *ip, int iel) return; } + /* Limit the number of connections we take at the same time. + * + * Admission control will happen before a client is created and connAccept() + * called, because we don't want to even start transport-level negotiation * if rejected.
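+ * (Example for the maxclients-reserved logic below: with maxclients 10000 and
+ * maxclients-reserved 32, remote clients are refused once 9968 connections
+ * exist, while loopback connections may still consume the final 32 slots,
+ * e.g. for an admin keydb-cli session during an incident.)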
*/ + if (listLength(g_pserver->clients) + getClusterConnectionsCount() + >= (g_pserver->maxclients - g_pserver->maxclientsReserved)) + { + // Allow the connection if it comes from localhost and we're within the maxclients-reserved buffer range + if ((listLength(g_pserver->clients) + getClusterConnectionsCount()) >= g_pserver->maxclients || strcmp("127.0.0.1", ip)) { + const char *err; + if (g_pserver->cluster_enabled) + err = "-ERR max number of clients + cluster " + "connections reached\r\n"; + else + err = "-ERR max number of clients reached\r\n"; + + /* That's a best effort error message, don't check write errors. + * Note that for TLS connections, no handshake was done yet so nothing + * is written and the connection will just drop. */ + if (connWrite(conn,err,strlen(err)) == -1) { + /* Nothing to do, Just to avoid the warning... */ + } + g_pserver->stat_rejected_conn++; + connClose(conn); + return; + } + } + /* Create connection and client */ if ((c = createClient(conn, iel)) == NULL) { serverLog(LL_WARNING, @@ -1985,9 +2008,9 @@ void ProcessPendingAsyncWrites() * writes may have been signalled without having been copied to the replyAsync buffer, * thus causing the buffer to be NULL */ if (c->replyAsync != nullptr){ - int size = c->replyAsync->used; + size_t size = c->replyAsync->used; - if (listLength(c->reply) == 0 && size <= (PROTO_REPLY_CHUNK_BYTES - c->bufpos)) { + if (listLength(c->reply) == 0 && size <= static_cast<size_t>(PROTO_REPLY_CHUNK_BYTES - c->bufpos)) { memcpy(c->buf + c->bufpos, c->replyAsync->buf(), size); c->bufpos += size; } else { @@ -2581,9 +2604,7 @@ void parseClientCommandBuffer(client *c) { (g_pserver->m_pstorageFactory || aeLockContested(cserver.cthreads/2) || cserver.cthreads == 1) && !GlobalLocksAcquired()) { auto &query = c->vecqueuedcmd.back(); if (query.argc > 0 && query.argc == query.argcMax) { - if (c->db->prefetchKeysAsync(c, query, c->vecqueuedcmd.size() == 1)) { - c->vecqueuedcmd.erase(c->vecqueuedcmd.begin()); - } + c->db->prefetchKeysAsync(c, query); } } c->reqtype = 0; @@ -2741,7 +2762,7 @@ void readQueryFromClient(connection *conn) { if (cserver.cthreads > 1 || g_pserver->m_pstorageFactory) { parseClientCommandBuffer(c); - if (g_pserver->enable_async_commands && !serverTL->disable_async_commands && listLength(g_pserver->monitors) == 0 && (aeLockContention() || serverTL->rgdbSnapshot[c->db->id] || g_fTestMode)) { + if (g_pserver->enable_async_commands && !serverTL->disable_async_commands && listLength(g_pserver->monitors) == 0 && (aeLockContention() || serverTL->rgdbSnapshot[c->db->id] || g_fTestMode) && !serverTL->in_eval && !serverTL->in_exec) { // Frequent writers aren't good candidates for this optimization, they cause us to renew the snapshot too often // so we exclude them unless the snapshot we need already exists. // Note: In test mode we want to create snapshots as often as possible to exercise them - we don't care about perf @@ -3236,16 +3257,18 @@ NULL } else { - int iel = client->iel; freeClientAsync(client); - aePostFunction(g_pserver->rgthreadvar[client->iel].el, [iel] { // note: failure is OK - freeClientsInAsyncFreeQueue(iel); - }); } } killed++; } + for (int iel = 0; iel < cserver.cthreads; ++iel) { + aePostFunction(g_pserver->rgthreadvar[iel].el, [iel] { // note: failure is OK + freeClientsInAsyncFreeQueue(iel); + }); + } + /* Reply according to old/new format. */ if (c->argc == 3) { if (killed == 0) @@ -3983,7 +4006,7 @@ void pauseClients(mstime_t end, pause_type type) { * to track this state so that we don't assert * in propagate().
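 * (Like propagate_in_transaction elsewhere in this diff, this flag is now
 * tracked per server thread via serverTL rather than globally on g_pserver.)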
*/ if (serverTL->in_exec) { - g_pserver->client_pause_in_transaction = 1; + serverTL->client_pause_in_transaction = 1; } } diff --git a/src/new.cpp b/src/new.cpp index 4e6b07dfd..3a299e7bf 100644 --- a/src/new.cpp +++ b/src/new.cpp @@ -41,10 +41,3 @@ void operator delete(void *p, std::size_t) noexcept } #endif - -#if defined(USE_JEMALLOC) -extern "C" size_t malloc_usable_size(void *ptr) -{ - return zmalloc_usable_size(ptr); -} -#endif diff --git a/src/object.cpp b/src/object.cpp index 44dcbd6e9..d3122dbbd 100644 --- a/src/object.cpp +++ b/src/object.cpp @@ -1091,7 +1091,7 @@ struct redisMemOverhead *getMemoryOverheadData(void) { mem_total += g_pserver->initial_memory_usage; mem = 0; - if (g_pserver->repl_backlog) + if (g_pserver->repl_backlog && g_pserver->repl_backlog != g_pserver->repl_backlog_disk) mem += zmalloc_size(g_pserver->repl_backlog); mh->repl_backlog = mem; mem_total += mem; @@ -1142,8 +1142,7 @@ struct redisMemOverhead *getMemoryOverheadData(void) { mem_total+=mem; std::unique_lock ul(g_expireLock); - mem = db->setexpire()->bytes_used(); - + mem = db->setexpire()->estimated_bytes_used(); mh->db[mh->num_dbs].overhead_ht_expires = mem; mem_total+=mem; @@ -1542,7 +1541,7 @@ void memoryCommand(client *c) { } else if (!strcasecmp(szFromObj(c->argv[1]),"malloc-stats") && c->argc == 2) { #if defined(USE_JEMALLOC) sds info = sdsempty(); - je_malloc_stats_print(inputCatSds, &info, NULL); + malloc_stats_print(inputCatSds, &info, NULL); addReplyVerbatim(c,info,sdslen(info),"txt"); sdsfree(info); #else @@ -1669,6 +1668,7 @@ robj *deserializeStoredObjectCore(const void *data, size_t cb) robj *deserializeStoredObject(const redisDbPersistentData *db, const char *key, const void *data, size_t cb) { robj *o = deserializeStoredObjectCore(data, cb); + std::unique_lock ul(g_expireLock); o->SetFExpires(db->setexpire()->exists(key)); return o; } diff --git a/src/rdb.cpp b/src/rdb.cpp index e7970569a..3f84a6b23 100644 --- a/src/rdb.cpp +++ b/src/rdb.cpp @@ -1163,7 +1163,7 @@ int rdbSaveKeyValuePair(rio *rdb, robj_roptr key, robj_roptr val, const expireEn char szT[32]; if (g_pserver->fActiveReplica) { - snprintf(szT, 32, "%" PRIu64, mvccFromObj(val)); + snprintf(szT, sizeof(szT), "%" PRIu64, mvccFromObj(val)); if (rdbSaveAuxFieldStrStr(rdb,"mvcc-tstamp", szT) == -1) return -1; } @@ -1190,7 +1190,7 @@ int rdbSaveKeyValuePair(rio *rdb, robj_roptr key, robj_roptr val, const expireEn { if (itr.subkey() == nullptr) continue; // already saved - snprintf(szT, 32, "%lld", itr.when()); + snprintf(szT, sizeof(szT), "%lld", itr.when()); rdbSaveAuxFieldStrStr(rdb,"keydb-subexpire-key",itr.subkey()); rdbSaveAuxFieldStrStr(rdb,"keydb-subexpire-when",szT); } @@ -1334,7 +1334,7 @@ int rdbSaveRio(rio *rdb, const redisDbPersistentDataSnapshot **rgpdb, int *error if (rdbSaveModulesAux(rdb, REDISMODULE_AUX_BEFORE_RDB) == -1) goto werr; for (j = 0; j < cserver.dbnum; j++) { - const redisDbPersistentDataSnapshot *db = rgpdb[j]; + const redisDbPersistentDataSnapshot *db = rgpdb != nullptr ? 
rgpdb[j] : g_pserver->db[j]; if (db->size() == 0) continue; /* Write the SELECT DB opcode */ @@ -1490,7 +1490,7 @@ int rdbSaveFile(char *filename, const redisDbPersistentDataSnapshot **rgpdb, rdb rio rdb; int error = 0; - snprintf(tmpfile,256,"temp-%d-%d.rdb", getpid(), g_pserver->rdbThreadVars.tmpfileNum); + getTempFileName(tmpfile, g_pserver->rdbThreadVars.tmpfileNum); fp = fopen(tmpfile,"w"); if (!fp) { char *cwdp = getcwd(cwd,MAXPATHLEN); @@ -1660,12 +1660,14 @@ int launchRdbSaveThread(pthread_t &child, rdbSaveInfo *rsi) pthread_attr_t tattr; pthread_attr_init(&tattr); pthread_attr_setstacksize(&tattr, 1 << 23); // 8 MB + openChildInfoPipe(); if (pthread_create(&child, &tattr, rdbSaveThread, args)) { pthread_attr_destroy(&tattr); for (int idb = 0; idb < cserver.dbnum; ++idb) g_pserver->db[idb]->endSnapshot(args->rgpdb[idb]); args->~rdbSaveThreadArgs(); zfree(args); + closeChildInfoPipe(); return C_ERR; } pthread_attr_destroy(&tattr); @@ -1679,24 +1681,23 @@ int rdbSaveBackground(rdbSaveInfo *rsi) { pthread_t child; long long start; - if (hasActiveChildProcess()) return C_ERR; + if (hasActiveChildProcessOrBGSave()) return C_ERR; g_pserver->dirty_before_bgsave = g_pserver->dirty; g_pserver->lastbgsave_try = time(NULL); - openChildInfoPipe(); start = ustime(); + latencyStartMonitor(g_pserver->rdb_save_latency); - - g_pserver->stat_fork_time = ustime()-start; - g_pserver->stat_fork_rate = (double) zmalloc_used_memory() * 1000000 / g_pserver->stat_fork_time / (1024*1024*1024); /* GB per second. */ if (launchRdbSaveThread(child, rsi) != C_OK) { - closeChildInfoPipe(); g_pserver->lastbgsave_status = C_ERR; serverLog(LL_WARNING,"Can't save in background: fork: %s", strerror(errno)); return C_ERR; } + + g_pserver->stat_fork_time = ustime()-start; + g_pserver->stat_fork_rate = (double) zmalloc_used_memory() * 1000000 / g_pserver->stat_fork_time / (1024*1024*1024); /* GB per second. */ latencyAddSampleIfNeeded("fork",g_pserver->stat_fork_time/1000); serverLog(LL_NOTICE,"Background saving started"); g_pserver->rdb_save_time_start = time(NULL); @@ -1714,7 +1715,7 @@ void getTempFileName(char tmpfile[], int tmpfileNum) { char tmpfileNumString[214]; /* Generate temp rdb file name using async-signal safe functions. */ - int pid_len = ll2string(pid, sizeof(pid), getpid()); + int pid_len = ll2string(pid, sizeof(pid), g_pserver->in_fork_child ?
getppid() : getpid()); int tmpfileNum_len = ll2string(tmpfileNumString, sizeof(tmpfileNumString), tmpfileNum); strcpy(tmpfile, "temp-"); strncpy(tmpfile+5, pid, pid_len); @@ -3115,13 +3116,12 @@ int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi) { std::unique_ptr spjob; // If we're tracking changes we need to reset this - bool fTracking = g_pserver->db[0]->FTrackingChanges(); - if (fTracking) { - // We don't want to track here because processChangesAsync is outside the normal scope handling - for (int idb = 0; idb < cserver.dbnum; ++idb) { + std::vector<bool> fTracking(cserver.dbnum); + // We don't want to track here because processChangesAsync is outside the normal scope handling + for (int idb = 0; idb < cserver.dbnum; ++idb) { + if ((fTracking[idb] = g_pserver->db[idb]->FTrackingChanges())) if (g_pserver->db[idb]->processChanges(false)) g_pserver->db[idb]->commitChanges(); - } } rdb->update_cksum = rdbLoadProgressCallback; @@ -3489,11 +3489,10 @@ int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi) { } wqueue.endWork(); - if (fTracking) { - // Reset track changes - for (int idb = 0; idb < cserver.dbnum; ++idb) { + // Reset track changes + for (int idb = 0; idb < cserver.dbnum; ++idb) { + if (fTracking[idb]) g_pserver->db[idb]->trackChanges(false); - } } if (empty_keys_skipped) { serverLog(LL_WARNING, @@ -3511,11 +3510,10 @@ int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi) { * the RDB file from a socket during initial SYNC (diskless replica mode), * we'll report the error to the caller, so that we can retry. */ eoferr: - if (fTracking) { // Reset track changes - for (int idb = 0; idb < cserver.dbnum; ++idb) { + for (int idb = 0; idb < cserver.dbnum; ++idb) { + if (fTracking[idb]) g_pserver->db[idb]->trackChanges(false); - } } wqueue.endWork(); @@ -3604,6 +3602,8 @@ static void backgroundSaveDoneHandlerDisk(int exitcode, bool fCancelled) { g_pserver->dirty = g_pserver->dirty - g_pserver->dirty_before_bgsave; g_pserver->lastsave = time(NULL); g_pserver->lastbgsave_status = C_OK; + latencyEndMonitor(g_pserver->rdb_save_latency); + latencyAddSampleIfNeeded("rdb-save",g_pserver->rdb_save_latency); } else if (!fCancelled && exitcode != 0) { serverLog(LL_WARNING, "Background saving error"); g_pserver->lastbgsave_status = C_ERR; @@ -3694,7 +3694,7 @@ void killRDBChild(bool fSynchronous) { serverAssert(GlobalLocksAcquired()); if (cserver.fForkBgSave) { - kill(g_pserver->rdb_child_pid,SIGUSR1); + kill(g_pserver->child_pid,SIGUSR1); } else { g_pserver->rdbThreadVars.fRdbThreadCancel = true; if (g_pserver->rdb_child_type == RDB_CHILD_TYPE_SOCKET) { @@ -3785,7 +3785,7 @@ int rdbSaveToSlavesSockets(rdbSaveInfo *rsi) { int pipefds[2]; rdbSaveSocketThreadArgs *args = nullptr; - if (hasActiveChildProcess()) return C_ERR; + if (hasActiveChildProcessOrBGSave()) return C_ERR; /* Even if the previous fork child exited, don't start a new one until we * drained the pipe. */ @@ -3832,57 +3832,130 @@ int rdbSaveToSlavesSockets(rdbSaveInfo *rsi) { } /* Create the child process.
*/ - openChildInfoPipe(); - - for (int idb = 0; idb < cserver.dbnum; ++idb) - args->rgpdb[idb] = g_pserver->db[idb]->createSnapshot(getMvccTstamp(), false /*fOptional*/); - - g_pserver->rdbThreadVars.tmpfileNum++; - g_pserver->rdbThreadVars.fRdbThreadCancel = false; - pthread_attr_t tattr; - pthread_attr_init(&tattr); - pthread_attr_setstacksize(&tattr, 1 << 23); // 8 MB - if (pthread_create(&child, &tattr, rdbSaveToSlavesSocketsThread, args)) { - pthread_attr_destroy(&tattr); - serverLog(LL_WARNING,"Can't save in background: fork: %s", - strerror(errno)); - - /* Undo the state change. The caller will perform cleanup on - * all the slaves in BGSAVE_START state, but an early call to - * replicationSetupSlaveForFullResync() turned it into BGSAVE_END */ - listRewind(g_pserver->slaves,&li); - while((ln = listNext(&li))) { - client *replica = (client*)ln->value; - if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_END) { - replica->replstate = SLAVE_STATE_WAIT_BGSAVE_START; + if (cserver.fForkBgSave) { + pid_t childpid; + if ((childpid = redisFork(CHILD_TYPE_RDB)) == 0) { + /* Child */ + int retval, dummy; + rio rdb; + + rioInitWithFd(&rdb,args->rdb_pipe_write); + + redisSetProcTitle("keydb-rdb-to-slaves"); + redisSetCpuAffinity(g_pserver->bgsave_cpulist); + + retval = rdbSaveRioWithEOFMark(&rdb,nullptr,nullptr,rsi); + if (retval == C_OK && rioFlush(&rdb) == 0) + retval = C_ERR; + + if (retval == C_OK) { + sendChildCowInfo(CHILD_INFO_TYPE_RDB_COW_SIZE, "RDB"); + } + + rioFreeFd(&rdb); + /* wake up the reader, tell it we're done. */ + close(args->rdb_pipe_write); + close(g_pserver->rdb_child_exit_pipe); /* close write end so that we can detect the close on the parent. */ + /* hold exit until the parent tells us it's safe. we're not expecting + * to read anything, just get the error when the pipe is closed. */ + dummy = read(args->safe_to_exit_pipe, pipefds, 1); + UNUSED(dummy); + exitFromChild((retval == C_OK) ? 0 : 1); + } else { + /* Parent */ + close(args->safe_to_exit_pipe); + if (childpid == -1) { + serverLog(LL_WARNING,"Can't save in background: fork: %s", + strerror(errno)); + + /* Undo the state change. The caller will perform cleanup on + * all the slaves in BGSAVE_START state, but an early call to + * replicationSetupSlaveForFullResync() turned it into BGSAVE_END */ + listRewind(g_pserver->slaves,&li); + while((ln = listNext(&li))) { + client *replica = (client*)ln->value; + if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_END) { + replica->replstate = SLAVE_STATE_WAIT_BGSAVE_START; + } + } + close(args->rdb_pipe_write); + close(g_pserver->rdb_pipe_read); + zfree(g_pserver->rdb_pipe_conns); + g_pserver->rdb_pipe_conns = NULL; + g_pserver->rdb_pipe_numconns = 0; + g_pserver->rdb_pipe_numconns_writing = 0; + args->rsi.~rdbSaveInfo(); + zfree(args); + } else { + serverLog(LL_NOTICE,"Background RDB transfer started by pid %ld", + (long)childpid); + g_pserver->rdb_save_time_start = time(NULL); + g_pserver->rdb_child_type = RDB_CHILD_TYPE_SOCKET; + g_pserver->rdbThreadVars.fRdbThreadActive = true; + updateDictResizePolicy(); + close(args->rdb_pipe_write); /* close write in parent so that it can detect the close on the child. */ + aePostFunction(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el, []{ + if (aeCreateFileEvent(serverTL->el, g_pserver->rdb_pipe_read, AE_READABLE, rdbPipeReadHandler, nullptr) == AE_ERR) { + serverPanic("Unrecoverable error creating g_pserver->rdb_pipe_read file event."); + } + }); } + return (childpid == -1) ? 
C_ERR : C_OK; } - close(args->rdb_pipe_write); - close(g_pserver->rdb_pipe_read); - zfree(g_pserver->rdb_pipe_conns); - close(args->safe_to_exit_pipe); - g_pserver->rdb_pipe_conns = NULL; - g_pserver->rdb_pipe_numconns = 0; - g_pserver->rdb_pipe_numconns_writing = 0; - args->rsi.~rdbSaveInfo(); - zfree(args); - closeChildInfoPipe(); - return C_ERR; } - pthread_attr_destroy(&tattr); - g_pserver->child_type = CHILD_TYPE_RDB; + else { + openChildInfoPipe(); - serverLog(LL_NOTICE,"Background RDB transfer started"); - g_pserver->rdb_save_time_start = time(NULL); - serverAssert(!g_pserver->rdbThreadVars.fRdbThreadActive); - g_pserver->rdbThreadVars.rdb_child_thread = child; - g_pserver->rdbThreadVars.fRdbThreadActive = true; - g_pserver->rdb_child_type = RDB_CHILD_TYPE_SOCKET; - aePostFunction(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el, []{ - if (aeCreateFileEvent(serverTL->el, g_pserver->rdb_pipe_read, AE_READABLE, rdbPipeReadHandler,NULL) == AE_ERR) { - serverPanic("Unrecoverable error creating server.rdb_pipe_read file event."); + for (int idb = 0; idb < cserver.dbnum; ++idb) + args->rgpdb[idb] = g_pserver->db[idb]->createSnapshot(getMvccTstamp(), false /*fOptional*/); + + g_pserver->rdbThreadVars.tmpfileNum++; + g_pserver->rdbThreadVars.fRdbThreadCancel = false; + pthread_attr_t tattr; + pthread_attr_init(&tattr); + pthread_attr_setstacksize(&tattr, 1 << 23); // 8 MB + if (pthread_create(&child, &tattr, rdbSaveToSlavesSocketsThread, args)) { + pthread_attr_destroy(&tattr); + serverLog(LL_WARNING,"Can't save in background: fork: %s", + strerror(errno)); + + /* Undo the state change. The caller will perform cleanup on + * all the slaves in BGSAVE_START state, but an early call to + * replicationSetupSlaveForFullResync() turned it into BGSAVE_END */ + listRewind(g_pserver->slaves,&li); + while((ln = listNext(&li))) { + client *replica = (client*)ln->value; + if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_END) { + replica->replstate = SLAVE_STATE_WAIT_BGSAVE_START; + } + } + close(args->rdb_pipe_write); + close(g_pserver->rdb_pipe_read); + zfree(g_pserver->rdb_pipe_conns); + close(args->safe_to_exit_pipe); + g_pserver->rdb_pipe_conns = NULL; + g_pserver->rdb_pipe_numconns = 0; + g_pserver->rdb_pipe_numconns_writing = 0; + args->rsi.~rdbSaveInfo(); + zfree(args); + closeChildInfoPipe(); + return C_ERR; } - }); + pthread_attr_destroy(&tattr); + g_pserver->child_type = CHILD_TYPE_RDB; + + serverLog(LL_NOTICE,"Background RDB transfer started"); + g_pserver->rdb_save_time_start = time(NULL); + serverAssert(!g_pserver->rdbThreadVars.fRdbThreadActive); + g_pserver->rdbThreadVars.rdb_child_thread = child; + g_pserver->rdbThreadVars.fRdbThreadActive = true; + g_pserver->rdb_child_type = RDB_CHILD_TYPE_SOCKET; + aePostFunction(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el, []{ + if (aeCreateFileEvent(serverTL->el, g_pserver->rdb_pipe_read, AE_READABLE, rdbPipeReadHandler, nullptr) == AE_ERR) { + serverPanic("Unrecoverable error creating server.rdb_pipe_read file event."); + } + }); + } return C_OK; /* Unreached. 
*/ } diff --git a/src/rdb.h b/src/rdb.h index 2e017e4e0..68dc7a138 100644 --- a/src/rdb.h +++ b/src/rdb.h @@ -151,6 +151,7 @@ int rdbLoad(rdbSaveInfo *rsi, int rdbflags); int rdbLoadFile(const char *filename, rdbSaveInfo *rsi, int rdbflags); int rdbSaveBackground(rdbSaveInfo *rsi); int rdbSaveToSlavesSockets(rdbSaveInfo *rsi); +void getTempFileName(char tmpfile[], int tmpfileNum); void rdbRemoveTempFile(pid_t childpid, int from_signal); int rdbSave(const redisDbPersistentDataSnapshot **rgpdb, rdbSaveInfo *rsi); int rdbSaveFile(char *filename, const redisDbPersistentDataSnapshot **rgpdb, rdbSaveInfo *rsi); diff --git a/src/readwritelock.h b/src/readwritelock.h index a7318a29f..79f0ac710 100644 --- a/src/readwritelock.h +++ b/src/readwritelock.h @@ -8,6 +8,7 @@ class readWriteLock { int m_readCount = 0; int m_writeCount = 0; bool m_writeWaiting = false; + bool m_notify = true; public: readWriteLock(const char *name) : m_readLock(name), m_writeLock(name) {} @@ -65,7 +66,8 @@ class readWriteLock { void releaseRead() { std::unique_lock rm(m_readLock); m_readCount--; - m_cv.notify_all(); + if (m_notify) + m_cv.notify_all(); } void releaseWrite(bool exclusive = true) { @@ -74,7 +76,8 @@ class readWriteLock { if (exclusive) m_writeLock.unlock(); m_writeCount--; - m_cv.notify_all(); + if (m_notify) + m_cv.notify_all(); } void downgradeWrite(bool exclusive = true) { @@ -82,6 +85,10 @@ class readWriteLock { acquireRead(); } + void setNotify(bool notify) { + m_notify = notify; + } + bool hasReader() { return m_readCount > 0; } diff --git a/src/redis-cli-cpphelper.cpp b/src/redis-cli-cpphelper.cpp index 4fdcef8ec..aabab2062 100644 --- a/src/redis-cli-cpphelper.cpp +++ b/src/redis-cli-cpphelper.cpp @@ -445,7 +445,7 @@ extern "C" void clusterManagerWaitForClusterJoin(void) { int counter = 0, check_after = CLUSTER_JOIN_CHECK_AFTER + (int)(listLength(cluster_manager.nodes) * 0.15f); - while(!clusterManagerIsConfigConsistent()) { + while(!clusterManagerIsConfigConsistent(0 /*fLog*/)) { printf("."); fflush(stdout); sleep(1); @@ -588,7 +588,7 @@ extern "C" int clusterManagerCheckCluster(int quiet) { int do_fix = config.cluster_manager_command.flags & CLUSTER_MANAGER_CMD_FLAG_FIX; if (!quiet) clusterManagerShowNodes(); - consistent = clusterManagerIsConfigConsistent(); + consistent = clusterManagerIsConfigConsistent(1 /*fLog*/); if (!consistent) { sds err = sdsnew("[ERR] Nodes don't agree about configuration!"); clusterManagerOnError(err); diff --git a/src/redis-cli.c b/src/redis-cli.c index a03194659..1b4494c95 100644 --- a/src/redis-cli.c +++ b/src/redis-cli.c @@ -1618,6 +1618,8 @@ static int parseOptions(int argc, char **argv) { fprintf(stderr, "Unknown --show-pushes value '%s' " "(valid: '[y]es', '[n]o')\n", argval); } + } else if (!strcmp(argv[i],"--force")) { + config.force_mode = 1; } else if (CLUSTER_MANAGER_MODE() && argv[i][0] != '-') { if (config.cluster_manager_command.argc == 0) { int j = i + 1; @@ -1793,6 +1795,7 @@ static void usage(void) { " --verbose Verbose mode.\n" " --no-auth-warning Don't show warning message when using password on command\n" " line interface.\n" +" --force Ignore validation and safety checks\n" " --help Output this help and exit.\n" " --version Output version and exit.\n" "\n"); @@ -3293,8 +3296,8 @@ static redisReply *clusterManagerMigrateKeysInReply(clusterManagerNode *source, argv_len = zcalloc(argc * sizeof(size_t), MALLOC_LOCAL); char portstr[255]; char timeoutstr[255]; - snprintf(portstr, 10, "%d", target->port); - snprintf(timeoutstr, 10, "%d", timeout); + 
snprintf(portstr, sizeof(portstr), "%d", target->port); + snprintf(timeoutstr, sizeof(timeoutstr), "%d", timeout); argv[0] = "MIGRATE"; argv_len[0] = 7; argv[1] = target->ip; @@ -3993,12 +3996,13 @@ static sds clusterManagerGetConfigSignature(clusterManagerNode *node) { return signature; } -int clusterManagerIsConfigConsistent(void) { +int clusterManagerIsConfigConsistent(int fLog) { if (cluster_manager.nodes == NULL) return 0; int consistent = (listLength(cluster_manager.nodes) <= 1); // If the Cluster has only one node, it's always consistent if (consistent) return 1; sds first_cfg = NULL; + const char *firstNode = NULL; listIter li; listNode *ln; listRewind(cluster_manager.nodes, &li); @@ -4009,10 +4013,14 @@ int clusterManagerIsConfigConsistent(void) { consistent = 0; break; } - if (first_cfg == NULL) first_cfg = cfg; - else { + if (first_cfg == NULL) { + first_cfg = cfg; + firstNode = node->name; + } else { consistent = !sdscmp(first_cfg, cfg); sdsfree(cfg); + if (fLog && !consistent) + clusterManagerLogInfo("\tNode %s (%s:%d) is inconsistent with %s\n", node->name, node->ip, node->port, firstNode); if (!consistent) break; } } @@ -5161,7 +5169,7 @@ static int clusterManagerCommandReshard(int argc, char **argv) { clusterManagerNode *node = clusterManagerNewNode(ip, port); if (!clusterManagerLoadInfoFromNode(node, 0)) return 0; clusterManagerCheckCluster(0); - if (cluster_manager.errors && listLength(cluster_manager.errors) > 0) { + if (cluster_manager.errors && listLength(cluster_manager.errors) > 0 && !config.force_mode) { fflush(stdout); fprintf(stderr, "*** Please fix your cluster problems before resharding\n"); @@ -5394,7 +5402,7 @@ static int clusterManagerCommandRebalance(int argc, char **argv) { if (weightedNodes == NULL) goto cleanup; /* Check cluster, only proceed if it looks sane. */ clusterManagerCheckCluster(1); - if (cluster_manager.errors && listLength(cluster_manager.errors) > 0) { + if (cluster_manager.errors && listLength(cluster_manager.errors) > 0 && !config.force_mode) { clusterManagerLogErr("*** Please fix your cluster problems " "before rebalancing\n"); result = 0; @@ -5485,8 +5493,8 @@ static int clusterManagerCommandRebalance(int argc, char **argv) { listAddNodeTail(lsrc, src); table = clusterManagerComputeReshardTable(lsrc, numslots); listRelease(lsrc); - int table_len = (int) listLength(table); - if (!table || table_len != numslots) { + int table_len = 0; + if (!table || (table_len = (int) listLength(table)) != numslots) { clusterManagerLogErr("*** Assertion failed: Reshard table " "!= number of slots"); result = 0; @@ -6825,7 +6833,7 @@ static long getLongInfoField(char *info, char *field) { /* Convert number of bytes into a human readable string of the form: * 100B, 2G, 100M, 4K, and so forth. 
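* The destination buffer size is now passed in explicitly so the conversion can be bounded with snprintf rather than sprintf; callers pass sizeof(buf).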
*/ -void bytesToHuman(char *s, long long n) { +void bytesToHuman(char *s, long long n, size_t bufsize) { double d; if (n < 0) { @@ -6835,17 +6843,17 @@ void bytesToHuman(char *s, long long n) { } if (n < 1024) { /* Bytes */ - sprintf(s,"%lldB",n); + snprintf(s,bufsize,"%lldB",n); return; } else if (n < (1024*1024)) { d = (double)n/(1024); - sprintf(s,"%.2fK",d); + snprintf(s,bufsize,"%.2fK",d); } else if (n < (1024LL*1024*1024)) { d = (double)n/(1024*1024); - sprintf(s,"%.2fM",d); + snprintf(s,bufsize,"%.2fM",d); } else if (n < (1024LL*1024*1024*1024)) { d = (double)n/(1024LL*1024*1024); - sprintf(s,"%.2fG",d); + snprintf(s,bufsize,"%.2fG",d); } } @@ -6875,38 +6883,38 @@ static void statMode(void) { for (j = 0; j < 20; j++) { long k; - sprintf(buf,"db%d:keys",j); + snprintf(buf,sizeof(buf),"db%d:keys",j); k = getLongInfoField(reply->str,buf); if (k == LONG_MIN) continue; aux += k; } - sprintf(buf,"%ld",aux); + snprintf(buf,sizeof(buf),"%ld",aux); printf("%-11s",buf); /* Used memory */ aux = getLongInfoField(reply->str,"used_memory"); - bytesToHuman(buf,aux); + bytesToHuman(buf,aux,sizeof(buf)); printf("%-8s",buf); /* Clients */ aux = getLongInfoField(reply->str,"connected_clients"); - sprintf(buf,"%ld",aux); + snprintf(buf,sizeof(buf),"%ld",aux); printf(" %-8s",buf); /* Blocked (BLPOPPING) Clients */ aux = getLongInfoField(reply->str,"blocked_clients"); - sprintf(buf,"%ld",aux); + snprintf(buf,sizeof(buf),"%ld",aux); printf("%-8s",buf); /* Requests */ aux = getLongInfoField(reply->str,"total_commands_processed"); - sprintf(buf,"%ld (+%ld)",aux,requests == 0 ? 0 : aux-requests); + snprintf(buf,sizeof(buf),"%ld (+%ld)",aux,requests == 0 ? 0 : aux-requests); printf("%-19s",buf); requests = aux; /* Connections */ aux = getLongInfoField(reply->str,"total_connections_received"); - sprintf(buf,"%ld",aux); + snprintf(buf,sizeof(buf),"%ld",aux); printf(" %-12s",buf); /* Children */ @@ -7185,6 +7193,7 @@ int main(int argc, char **argv) { config.set_errcode = 0; config.no_auth_warning = 0; config.in_multi = 0; + config.force_mode = 0; config.cluster_manager_command.name = NULL; config.cluster_manager_command.argc = 0; config.cluster_manager_command.argv = NULL; diff --git a/src/redis-cli.h b/src/redis-cli.h index 20ef1aba4..677f952c1 100644 --- a/src/redis-cli.h +++ b/src/redis-cli.h @@ -97,9 +97,6 @@ extern "C" { #define CC_FORCE (1<<0) /* Re-connect if already connected. */ #define CC_QUIET (1<<1) /* Don't log connecting errors. 
*/ -struct clusterManagerLink; -typedef struct clusterManagerLink clusterManagerLink; - /* Dict Helpers */ uint64_t dictSdsHash(const void *key); @@ -198,6 +195,7 @@ extern struct config { int in_multi; int pre_multi_dbnum; int quoted_input; /* Force input args to be treated as quoted strings */ + int force_mode; } config; struct clusterManager { @@ -285,7 +283,7 @@ int clusterManagerFixOpenSlot(int slot); void clusterManagerPrintSlotsList(list *slots); int clusterManagerGetCoveredSlots(char *all_slots); void clusterManagerOnError(sds err); -int clusterManagerIsConfigConsistent(void); +int clusterManagerIsConfigConsistent(int fLog); void freeClusterManagerNode(clusterManagerNode *node); void clusterManagerLog(int level, const char* fmt, ...); int parseClusterNodeAddress(char *addr, char **ip_ptr, int *port_ptr, diff --git a/src/redismodule.h b/src/redismodule.h index 4313aee01..10736797a 100644 --- a/src/redismodule.h +++ b/src/redismodule.h @@ -326,6 +326,7 @@ static const RedisModuleEvent #define REDISMODULE_SUBEVENT_LOADING_ENDED 3 #define REDISMODULE_SUBEVENT_LOADING_FAILED 4 #define _REDISMODULE_SUBEVENT_LOADING_NEXT 5 +#define REDISMODULE_SUBEVENT_LOADING_FLASH_START 6 #define REDISMODULE_SUBEVENT_CLIENT_CHANGE_CONNECTED 0 #define REDISMODULE_SUBEVENT_CLIENT_CHANGE_DISCONNECTED 1 diff --git a/src/replication.cpp b/src/replication.cpp index 0ef43485b..ee3583692 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -190,7 +190,7 @@ int bg_unlink(const char *filename) { bool createDiskBacklog() { // Lets create some disk backed pages and add them here std::string path = "./repl-backlog-temp" + std::to_string(gettid()); -#ifdef __APPLE__ +#if (defined __APPLE__ || defined __FreeBSD__) int fd = open(path.c_str(), O_CREAT | O_RDWR, S_IRUSR | S_IWUSR); #else int fd = open(path.c_str(), O_CREAT | O_RDWR | O_LARGEFILE, S_IRUSR | S_IWUSR); @@ -345,7 +345,8 @@ void freeReplicationBacklog(void) { client *c = (client*)listNodeValue(ln); serverAssert(c->flags & CLIENT_CLOSE_ASAP || FMasterHost(c)); } - zfree(g_pserver->repl_backlog); + if (g_pserver->repl_backlog != g_pserver->repl_backlog_disk) + zfree(g_pserver->repl_backlog); g_pserver->repl_backlog = NULL; } @@ -410,7 +411,7 @@ void feedReplicationBacklog(const void *ptr, size_t len) { if (minimumsize > g_pserver->repl_backlog_size && listening_replicas) { // This is an emergency overflow, we better resize to fit long long newsize = std::max(g_pserver->repl_backlog_size*2, minimumsize); - serverLog(LL_WARNING, "Replication backlog is too small, resizing to: %lld bytes", newsize); + serverLog(LL_WARNING, "Replication backlog is too small, resizing from %lld to %lld bytes", g_pserver->repl_backlog_size, newsize); resizeReplicationBacklog(newsize); } else if (!listening_replicas) { // We need to update a few variables or later asserts will notice we dropped data @@ -1132,9 +1133,17 @@ class replicationBuffer { replica->repl_put_online_on_ack = 1; } } + + void abort() { + for (auto replica : replicas) { + // Close the connection to force a resync + freeClientAsync(replica); + } + replicas.clear(); + } }; -int rdbSaveSnapshotForReplication(struct rdbSaveInfo *rsi) { +int rdbSaveSnapshotForReplication(rdbSaveInfo *rsi) { // TODO: This needs to be on a background thread int retval = C_OK; serverAssert(GlobalLocksAcquired()); @@ -1227,7 +1236,11 @@ int rdbSaveSnapshotForReplication(struct rdbSaveInfo *rsi) { retval = C_ERR; break; } - serverAssert(count == snapshotDeclaredCount); + if (count != snapshotDeclaredCount) { + serverLog(LL_WARNING, 
"Replication BUG: Count of keys sent does not match actual count. Aborting full sync."); + replBuf.abort(); + break; + } } replBuf.end(); @@ -1526,7 +1539,7 @@ void syncCommand(client *c) { } else { /* We don't have a BGSAVE in progress, let's start one. Diskless * or disk-based mode is determined by replica's capacity. */ - if (!hasActiveChildProcess()) { + if (!hasActiveChildProcessOrBGSave()) { startBgsaveForReplication(c->slave_capa); } else { serverLog(LL_NOTICE, @@ -2977,10 +2990,11 @@ bool readSyncBulkPayloadRdb(connection *conn, redisMaster *mi, rdbSaveInfo &rsi, } void readSyncBulkPayload(connection *conn) { + serverAssert(GlobalLocksAcquired()); rdbSaveInfo rsi; redisMaster *mi = (redisMaster*)connGetPrivateData(conn); static int usemark = 0; - if (mi == nullptr) { + if (mi == nullptr || conn != mi->repl_transfer_s) { // We're about to be free'd so bail out return; } @@ -2993,6 +3007,9 @@ void readSyncBulkPayload(connection *conn) { return; } + if (conn != mi->repl_transfer_s) + return; + /* Final setup of the connected slave <- master link */ replicationCreateMasterClient(mi,mi->repl_transfer_s,rsi.repl_stream_db); if (mi->isRocksdbSnapshotRepl) { @@ -3716,11 +3733,11 @@ void syncWithMaster(connection *conn) { } /* Prepare a suitable temp file for bulk transfer */ - if (!useDisklessLoad()) { + if (!useDisklessLoad() && !mi->isRocksdbSnapshotRepl) { while(maxtries--) { auto dt = std::chrono::system_clock::now().time_since_epoch(); auto dtMillisecond = std::chrono::duration_cast(dt); - snprintf(tmpfile,256, + snprintf(tmpfile,sizeof(tmpfile), "temp-%d.%ld.rdb",(int)dtMillisecond.count(),(long int)getpid()); dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644); if (dfd != -1) break; @@ -3781,9 +3798,11 @@ int connectWithMaster(redisMaster *mi) { connSetPrivateData(mi->repl_transfer_s, mi); if (connConnect(mi->repl_transfer_s, mi->masterhost, mi->masterport, NET_FIRST_BIND_ADDR, syncWithMaster) == C_ERR) { - int sev = g_pserver->enable_multimaster ? LL_NOTICE : LL_WARNING; // with multimaster its not unheard of to intentiallionall have downed masters - serverLog(sev,"Unable to connect to MASTER: %s", - connGetLastError(mi->repl_transfer_s)); + const char *err = "Unknown Error"; + if (mi->repl_transfer_s->last_errno != 0) + err = connGetLastError(mi->repl_transfer_s); + int sev = g_pserver->enable_multimaster ? LL_NOTICE : LL_WARNING; // with multimaster its not unheard of to intentionally have downed masters + serverLog(sev, "Unable to connect to MASTER: %s", err); connClose(mi->repl_transfer_s); mi->repl_transfer_s = NULL; return C_ERR; @@ -3801,12 +3820,14 @@ int connectWithMaster(redisMaster *mi) { * Never call this function directly, use cancelReplicationHandshake() instead. */ void undoConnectWithMaster(redisMaster *mi) { + serverAssert(GlobalLocksAcquired()); auto conn = mi->repl_transfer_s; connSetPrivateData(conn, nullptr); - aePostFunction(g_pserver->rgthreadvar[mi->ielReplTransfer].el, [conn]{ - connClose(conn); - }); mi->repl_transfer_s = NULL; + int result = aePostFunction(g_pserver->rgthreadvar[mi->ielReplTransfer].el, [conn]{ + connClose(conn); + }, false); + serverAssert(result == AE_OK); } /* Abort the async download of the bulk dataset while SYNC-ing with master. 
@@ -3961,6 +3982,10 @@ void freeMasterInfo(redisMaster *mi) { sdsfree(mi->masterauth); zfree(mi->masteruser); + if (g_pserver->rdb_filename != nullptr && g_pserver->rdb_filename == mi->repl_transfer_tmpfile) { + unlink(g_pserver->rdb_filename); + g_pserver->rdb_filename = nullptr; + } if (mi->repl_transfer_tmpfile) zfree(mi->repl_transfer_tmpfile); delete mi->staleKeyMap; @@ -4962,7 +4987,7 @@ void replicationStartPendingFork(void) { * In case of diskless replication, we make sure to wait the specified * number of seconds (according to configuration) so that other slaves * have the time to arrive before we start streaming. */ - if (!hasActiveChildProcess()) { + if (!hasActiveChildProcessOrBGSave()) { time_t idle, max_idle = 0; int slaves_waiting = 0; int mincapa = -1; @@ -5630,7 +5655,10 @@ void flushReplBacklogToClients() replica->repl_end_off = g_pserver->master_repl_offset; /* Only if there isn't already a pending write do we prepare the client to write */ - serverAssert(replica->repl_curr_off != g_pserver->master_repl_offset); + if (replica->repl_curr_off == g_pserver->master_repl_offset) { + serverLog(LL_DEBUG, "Pending write when it's on repl_offset=%lld", g_pserver->master_repl_offset); + continue; + } prepareClientToWrite(replica); } if (fAsyncWrite) diff --git a/src/scripting.cpp b/src/scripting.cpp index 3995eb8f2..f1772cd5c 100644 --- a/src/scripting.cpp +++ b/src/scripting.cpp @@ -1264,6 +1264,9 @@ void scriptingInit(int setup) { /* Finally set the table as 'redis' global var. */ lua_setglobal(lua,"redis"); + /* Set table as 'keydb' global var as well */ + lua_getglobal(lua,"redis"); + lua_setglobal(lua,"keydb"); /* Replace math.random and math.randomseed with our implementations. */ lua_getglobal(lua,"math"); diff --git a/src/semiorderedset.h b/src/semiorderedset.h index fe463fe31..2cd28b507 100644 --- a/src/semiorderedset.h +++ b/src/semiorderedset.h @@ -27,6 +27,7 @@ namespace keydbutils template<> size_t hash(const sdsview &); } +extern size_t g_semiOrderedSetTargetBucketSize; template class semiorderedset @@ -41,11 +42,14 @@ class semiorderedset size_t idxRehash = (1ULL << bits_min); int cfPauseRehash = 0; - constexpr size_t targetElementsPerBucket() + inline size_t targetElementsPerBucket() { // Aim for roughly 4 cache lines per bucket (determined by empirical testing) // lower values are faster but use more memory - return std::max((64/sizeof(T))*8, (size_t)2); + if (g_semiOrderedSetTargetBucketSize == 0) + return std::max((64/sizeof(T))*8, (size_t)2); + else + return g_semiOrderedSetTargetBucketSize; } public: @@ -240,14 +244,12 @@ class semiorderedset bool empty() const noexcept { return celem == 0; } size_t size() const noexcept { return celem; } - size_t bytes_used() const + size_t estimated_bytes_used() const { - size_t cb = sizeof(this) + (m_data.capacity()-m_data.size())*sizeof(T); - for (auto &vec : m_data) - { - if (vec != nullptr) - cb += vec->bytes_used(); - } + // This estimate doesn't include all the overhead of the internal vectors + size_t cb = sizeof(this) + + (m_data.capacity() * sizeof(m_data[0])) + + sizeof(T) * size(); return cb; } diff --git a/src/server.cpp b/src/server.cpp index a480476e3..b8dbbf5a8 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -70,6 +70,7 @@ #ifdef __linux__ #include #include +#include #endif int g_fTestMode = false; @@ -779,7 +780,7 @@ struct redisCommand redisCommandTable[] = { 0,NULL,0,0,0,0,0,0}, {"shutdown",shutdownCommand,-1, - "admin no-script ok-loading ok-stale", + "admin no-script ok-loading ok-stale noprop",
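+ // "noprop" is introduced by this change; presumably it keeps SHUTDOWN out of replication/AOF propagation.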
0,NULL,0,0,0,0,0,0}, {"lastsave",lastsaveCommand,1, @@ -1706,7 +1707,7 @@ int redisDbPersistentData::incrementallyRehash() { * for dict.c to resize the hash tables accordingly to the fact we have an * active fork child running. */ void updateDictResizePolicy(void) { - if (!hasActiveChildProcess() || (g_pserver->FRdbSaveInProgress() && !cserver.fForkBgSave)) + if (!hasActiveChildProcess()) dictEnableResize(); else dictDisableResize(); @@ -1725,7 +1726,11 @@ const char *strChildType(int type) { /* Return true if there are active children processes doing RDB saving, * AOF rewriting, or some side process spawned by a loaded module. */ int hasActiveChildProcess() { - return g_pserver->FRdbSaveInProgress() || g_pserver->child_pid != -1; + return g_pserver->child_pid != -1; +} + +int hasActiveChildProcessOrBGSave() { + return g_pserver->FRdbSaveInProgress() || hasActiveChildProcess(); } void resetChildState() { @@ -1954,6 +1959,16 @@ void getExpansiveClientsInfo(size_t *in_usage, size_t *out_usage) { *out_usage = o; } +int closeClientOnOverload(client *c) { + if (g_pserver->overload_closed_clients > MAX_CLIENTS_SHED_PER_PERIOD) return false; + if (!g_pserver->is_overloaded) return false; + // Don't close masters, replicas, or pub/sub clients + if (c->flags & (CLIENT_MASTER | CLIENT_SLAVE | CLIENT_PENDING_WRITE | CLIENT_PUBSUB | CLIENT_BLOCKED)) return false; + freeClient(c); + ++g_pserver->overload_closed_clients; + return true; +} + /* This function is called by serverCron() and is used in order to perform * operations on clients that are important to perform constantly. For instance * we use this function in order to disconnect clients after a timeout, including @@ -2024,6 +2039,7 @@ void clientsCron(int iel) { if (clientsCronTrackExpansiveClients(c, curr_peak_mem_usage_slot)) goto LContinue; if (clientsCronTrackClientsMemUsage(c)) goto LContinue; if (closeClientOnOutputBufferLimitReached(c, 0)) continue; // Client also free'd + if (closeClientOnOverload(c)) continue; LContinue: fastlock_unlock(&c->lock); } @@ -2074,7 +2090,7 @@ void databasesCron(bool fMainThread) { /* Perform hash tables rehashing if needed, but only if there are no * other processes saving the DB on disk. Otherwise rehashing is bad * as will cause a lot of copy-on-write of memory pages. */ - if (!hasActiveChildProcess() || g_pserver->FRdbSaveInProgress()) { + if (!hasActiveChildProcess()) { /* We use global counters so if we stop the computation at a given * DB we'll be able to start from the successive in the next * cron loop iteration. */ @@ -2215,6 +2231,7 @@ void checkChildrenDone(void) { g_pserver->rdbThreadVars.fRdbThreadCancel = false; g_pserver->rdbThreadVars.fDone = false; if (exitcode == 0) receiveChildInfo(); + closeChildInfoPipe(); } } else if ((pid = waitpid(-1, &statloc, WNOHANG)) != 0) { @@ -2296,6 +2313,10 @@ void cronUpdateMemoryStats() { g_pserver->cron_malloc_stats.allocator_active = g_pserver->cron_malloc_stats.allocator_resident; if (!g_pserver->cron_malloc_stats.allocator_allocated) g_pserver->cron_malloc_stats.allocator_allocated = g_pserver->cron_malloc_stats.zmalloc_used; + + if (g_pserver->force_eviction_percent) { + g_pserver->cron_malloc_stats.sys_available = getMemAvailable(); + } } } @@ -2468,14 +2489,14 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) { /* Start a scheduled AOF rewrite if this was requested by the user while * a BGSAVE was in progress. 
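* The check below uses hasActiveChildProcessOrBGSave() because a KeyDB background save may run on a thread rather than a forked child, and either one must delay the rewrite.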
*/ - if (!hasActiveChildProcess() && + if (!hasActiveChildProcessOrBGSave() && g_pserver->aof_rewrite_scheduled) { rewriteAppendOnlyFileBackground(); } /* Check if a background saving or AOF rewrite in progress terminated. */ - if (hasActiveChildProcess() || ldbPendingChildren()) + if (hasActiveChildProcessOrBGSave() || ldbPendingChildren()) { run_with_period(1000) receiveChildInfo(); checkChildrenDone(); @@ -2517,7 +2538,7 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) { /* Trigger an AOF rewrite if needed. */ if (g_pserver->aof_state == AOF_ON && - !hasActiveChildProcess() && + !hasActiveChildProcessOrBGSave() && g_pserver->aof_rewrite_perc && g_pserver->aof_current_size > g_pserver->aof_rewrite_min_size) { @@ -2576,6 +2597,26 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) { migrateCloseTimedoutSockets(); } + /* Check for CPU Overload */ + run_with_period(10'000) { + g_pserver->is_overloaded = false; + g_pserver->overload_closed_clients = 0; + static clock_t last = 0; + if (g_pserver->overload_protect_threshold > 0) { + clock_t cur = clock(); + double perc = static_cast(cur - last) / (CLOCKS_PER_SEC*10); + perc /= cserver.cthreads; + perc *= 100.0; + serverLog(LL_WARNING, "CPU Used: %.2f", perc); + if (perc > g_pserver->overload_protect_threshold) { + serverLog(LL_WARNING, "\tWARNING: CPU overload detected."); + g_pserver->is_overloaded = true; + } + last = cur; + } + } + + /* Tune the fastlock to CPU load */ run_with_period(30000) { /* Tune the fastlock to CPU load */ fastlock_auto_adjust_waits(); @@ -2601,7 +2642,7 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) { * Note: this code must be after the replicationCron() call above so * make sure when refactoring this file to keep this order. This is useful * because we want to give priority to RDB savings for replication. */ - if (!hasActiveChildProcess() && + if (!hasActiveChildProcessOrBGSave() && g_pserver->rdb_bgsave_scheduled && (g_pserver->unixtime-g_pserver->lastbgsave_try > CONFIG_BGSAVE_RETRY_DELAY || g_pserver->lastbgsave_status == C_OK)) @@ -2648,6 +2689,26 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) { } } + if (g_pserver->soft_shutdown) { + /* Loop through our clients list and see if there are any active clients */ + listIter li; + listNode *ln; + listRewind(g_pserver->clients, &li); + bool fActiveClient = false; + while ((ln = listNext(&li)) && !fActiveClient) { + client *c = (client*)listNodeValue(ln); + if (c->flags & CLIENT_IGNORE_SOFT_SHUTDOWN) + continue; + fActiveClient = true; + } + if (!fActiveClient) { + if (prepareForShutdown(SHUTDOWN_NOFLAGS) == C_OK) { + serverLog(LL_WARNING, "All active clients have disconnected while a soft shutdown is pending. Shutting down now."); + throw ShutdownException(); + } + } + } + g_pserver->cronloops++; return 1000/g_pserver->hz; } @@ -2775,6 +2836,8 @@ void beforeSleep(struct aeEventLoop *eventLoop) { AeLocker locker; int iel = ielFromEventLoop(eventLoop); + tlsProcessPendingData(); + locker.arm(); /* end any snapshots created by fast async commands */ @@ -2793,9 +2856,6 @@ void beforeSleep(struct aeEventLoop *eventLoop) { runAndPropogateToReplicas(processClients); - /* Handle precise timeouts of blocked clients. */ - handleBlockedClientsTimeout(); - /* Just call a subset of vital functions in case we are re-entering * the event loop from processEventsWhileBlocked(). 
Note that in this * case we keep track of the number of events we are processing, since @@ -2805,7 +2865,6 @@ uint64_t processed = 0; int aof_state = g_pserver->aof_state; locker.disarm(); - processed += tlsProcessPendingData(); processed += handleClientsWithPendingWrites(iel, aof_state); locker.arm(); processed += freeClientsInAsyncFreeQueue(iel); @@ -2816,13 +2875,6 @@ /* Handle precise timeouts of blocked clients. */ handleBlockedClientsTimeout(); - /* Handle TLS pending data. (must be done before flushAppendOnlyFile) */ - if (tlsHasPendingData()) { - locker.disarm(); - tlsProcessPendingData(); - locker.arm(); - } - /* If tls still has pending unread data don't sleep at all. */ aeSetDontWait(eventLoop, tlsHasPendingData()); @@ -2888,7 +2940,7 @@ static thread_local bool fFirstRun = true; // note: we also copy the DB pointer in case a DB swap is done while the lock is released std::vector<redisDb*> vecdb; // note we cache the database pointer in case a dbswap is done while the lock is released - if (cserver.storage_memory_model == STORAGE_WRITETHROUGH && g_pserver->m_pstorageFactory != nullptr && !g_pserver->loading) + if (cserver.storage_memory_model == STORAGE_WRITETHROUGH && !g_pserver->loading) { if (!fFirstRun) { mstime_t storage_process_latency; @@ -3001,8 +3053,11 @@ void afterSleep(struct aeEventLoop *eventLoop) { serverAssert(serverTL->gcEpoch.isReset()); serverTL->gcEpoch = g_pserver->garbageCollector.startEpoch(); + + aeAcquireLock(); for (int idb = 0; idb < cserver.dbnum; ++idb) g_pserver->db[idb]->trackChanges(false); + aeReleaseLock(); serverTL->disable_async_commands = false; } @@ -3862,9 +3917,25 @@ void initServer(void) { g_pserver->db = (redisDb**)zmalloc(sizeof(redisDb*)*cserver.dbnum, MALLOC_LOCAL); /* Create the Redis databases, and initialize other internal state. */ - for (int j = 0; j < cserver.dbnum; j++) { - g_pserver->db[j] = new (MALLOC_LOCAL) redisDb(); - g_pserver->db[j]->initialize(j); + if (g_pserver->m_pstorageFactory == nullptr) { + for (int j = 0; j < cserver.dbnum; j++) { + g_pserver->db[j] = new (MALLOC_LOCAL) redisDb(); + g_pserver->db[j]->initialize(j); + } + } else { + // Read FLASH metadata and load the appropriate dbid into each database index, as each DB index can have a different dbid mapped due to the swapdb command.
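+ // Each index looks up its mapping under the metadata key "db-<idx>"; when no entry exists the identity mapping (dbid == idb) below is kept.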
+ g_pserver->metadataDb = g_pserver->m_pstorageFactory->createMetadataDb(); + for (int idb = 0; idb < cserver.dbnum; ++idb) + { + int dbid = idb; + std::string dbid_key = "db-" + std::to_string(idb); + g_pserver->metadataDb->retrieve(dbid_key.c_str(), dbid_key.length(), [&](const char *, size_t, const void *data, size_t){ + dbid = *(int*)data; + }); + + g_pserver->db[idb] = new (MALLOC_LOCAL) redisDb(); + g_pserver->db[idb]->initialize(dbid); + } } for (int i = 0; i < MAX_EVENT_LOOPS; ++i) @@ -3926,8 +3997,6 @@ void initServer(void) { g_pserver->pubsub_channels = dictCreate(&keylistDictType,NULL); g_pserver->pubsub_patterns = dictCreate(&keylistDictType,NULL); g_pserver->cronloops = 0; - g_pserver->propagate_in_transaction = 0; - g_pserver->client_pause_in_transaction = 0; g_pserver->child_pid = -1; g_pserver->child_type = CHILD_TYPE_NONE; g_pserver->rdbThreadVars.fRdbThreadCancel = false; @@ -3967,6 +4036,8 @@ void initServer(void) { g_pserver->cron_malloc_stats.allocator_allocated = 0; g_pserver->cron_malloc_stats.allocator_active = 0; g_pserver->cron_malloc_stats.allocator_resident = 0; + g_pserver->cron_malloc_stats.sys_available = 0; + g_pserver->cron_malloc_stats.sys_total = g_pserver->force_eviction_percent ? getMemTotal() : 0; g_pserver->lastbgsave_status = C_OK; g_pserver->aof_last_write_status = C_OK; g_pserver->aof_last_write_errno = 0; @@ -3974,6 +4045,7 @@ void initServer(void) { g_pserver->mvcc_tstamp = 0; + /* Create the timer callback, this is our way to process many background * operations incrementally, like clients timeout, eviction of unaccessed * expired keys and so forth. */ @@ -4014,7 +4086,6 @@ void initServer(void) { latencyMonitorInit(); if (g_pserver->m_pstorageFactory) { - g_pserver->metadataDb = g_pserver->m_pstorageFactory->createMetadataDb(); if (g_pserver->metadataDb) { g_pserver->metadataDb->retrieve("repl-id", 7, [&](const char *, size_t, const void *data, size_t cb){ if (cb == sizeof(g_pserver->replid)) { @@ -4050,12 +4121,6 @@ void initServer(void) { } } - /* We have to initialize storage providers after the cluster has been initialized */ - for (int idb = 0; idb < cserver.dbnum; ++idb) - { - g_pserver->db[idb]->storageProviderInitialize(); - } - saveMasterStatusToStorage(false); // eliminate the repl-offset field /* Initialize ACL default password if it exists */ @@ -4068,6 +4133,15 @@ void initServer(void) { * Thread Local Storage initialization collides with dlopen call. * see: https://sourceware.org/bugzilla/show_bug.cgi?id=19329 */ void InitServerLast() { + + /* We have to initialize storage providers after the cluster has been initialized */ + moduleFireServerEvent(REDISMODULE_EVENT_LOADING, REDISMODULE_SUBEVENT_LOADING_FLASH_START, NULL); + for (int idb = 0; idb < cserver.dbnum; ++idb) + { + g_pserver->db[idb]->storageProviderInitialize(); + } + moduleFireServerEvent(REDISMODULE_EVENT_LOADING, REDISMODULE_SUBEVENT_LOADING_ENDED, NULL); + bioInit(); set_jemalloc_bg_thread(cserver.jemalloc_bg_thread); g_pserver->initial_memory_usage = zmalloc_used_memory(); @@ -4298,12 +4372,12 @@ void propagate(struct redisCommand *cmd, int dbid, robj **argv, int argc, * This way we'll deliver the MULTI/..../EXEC block as a whole and * both the AOF and the replication link will have the same consistency * and atomicity guarantees. 
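* Note the transaction-scoped flags consulted below now live in serverTL (per server thread) rather than in the global server struct, since each thread can be inside its own EXEC.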
*/ - if (serverTL->in_exec && !g_pserver->propagate_in_transaction) + if (serverTL->in_exec && !serverTL->propagate_in_transaction) execCommandPropagateMulti(dbid); /* This needs to be unreachable since the dataset should be fixed during * client pause, otherwise data may be lost during a failover. */ - serverAssert(!(areClientsPaused() && !g_pserver->client_pause_in_transaction)); + serverAssert(!(areClientsPaused() && !serverTL->client_pause_in_transaction)); if (g_pserver->aof_state != AOF_OFF && flags & PROPAGATE_AOF) feedAppendOnlyFile(cmd,dbid,argv,argc); @@ -4625,8 +4699,8 @@ void call(client *c, int flags) { /* Client pause takes effect after a transaction has finished. This needs * to be located after everything is propagated. */ - if (!serverTL->in_exec && g_pserver->client_pause_in_transaction) { - g_pserver->client_pause_in_transaction = 0; + if (!serverTL->in_exec && serverTL->client_pause_in_transaction) { + serverTL->client_pause_in_transaction = 0; } /* If the client has keys tracking enabled for client side caching, @@ -4717,8 +4791,9 @@ int processCommand(client *c, int callFlags) { /* Both EXEC and EVAL call call() directly so there should be * no way in_exec or in_eval or propagate_in_transaction is 1. * That is unless lua_timedout, in which case client may run - * some commands. */ - serverAssert(!g_pserver->propagate_in_transaction); + * some commands. Also possible that some other thread set + * propagate_in_transaction if this is an async command. */ + serverAssert(!serverTL->propagate_in_transaction); serverAssert(!serverTL->in_exec); serverAssert(!serverTL->in_eval); } @@ -5015,10 +5090,29 @@ int processCommand(client *c, int callFlags) { } else { /* If the command was replication or admin related we *must* flush our buffers first. This is in case something happens which would modify what we would send to replicas */ - if (c->cmd->flags & (CMD_MODULE | CMD_ADMIN)) flushReplBacklogToClients(); + if (c->flags & CLIENT_AUDIT_LOGGING){ + getKeysResult result = GETKEYS_RESULT_INIT; + int numkeys = getKeysFromCommand(c->cmd, c->argv, c->argc, &result); + int *keyindex = result.keys; + + sds str = sdsempty(); + for (int j = 0; j < numkeys; j++) { + str = sdscatsds(str, (sds)ptrFromObj(c->argv[keyindex[j]])); + str = sdscat(str, " "); + } + + if (numkeys > 0) + { + serverLog(LL_NOTICE, "Audit Log: %s, cmd %s, keys: %s", c->fprint, c->cmd->name, str); + } else { + serverLog(LL_NOTICE, "Audit Log: %s, cmd %s", c->fprint, c->cmd->name); + } + sdsfree(str); + getKeysFreeResult(&result); + } + call(c,callFlags); c->woff = g_pserver->master_repl_offset; @@ -5053,6 +5147,8 @@ bool client::asyncCommand(std::function &&postFn) { serverAssert(FCorrectThread(this)); + if (serverTL->in_eval) + return false; // we cannot block clients in EVAL const redisDbPersistentDataSnapshot *snapshot = nullptr; if (!(this->flags & (CLIENT_MULTI | CLIENT_BLOCKED))) snapshot = this->db->createSnapshot(this->mvccCheckpoint, false /* fOptional */); @@ -5148,7 +5244,7 @@ int prepareForShutdown(int flags) { * to unlink file actually) in background thread. * The temp rdb file fd may not be closed when redis exits quickly, * but OS will close this fd when process exits. */ - rdbRemoveTempFile(g_pserver->child_pid, 0); + rdbRemoveTempFile(g_pserver->rdbThreadVars.tmpfileNum, 0); } /* Kill module child if there is one.
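* (The temp file cleanup above is now keyed by the thread-save tmpfileNum used by getTempFileName(), rather than by a child pid.)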
*/ @@ -5297,6 +5393,11 @@ void pingCommand(client *c) { return; } + if (g_pserver->soft_shutdown && !(c->flags & CLIENT_IGNORE_SOFT_SHUTDOWN)) { + addReplyError(c, "-SHUTDOWN PENDING"); + return; + } + if (c->flags & CLIENT_PUBSUB && c->resp == 2) { addReply(c,shared.mbulkhdr[2]); addReplyBulkCBuffer(c,"pong",4); @@ -5443,30 +5544,30 @@ NULL /* Convert an amount of bytes into a human readable string in the form * of 100B, 2G, 100M, 4K, and so forth. */ -void bytesToHuman(char *s, unsigned long long n) { +void bytesToHuman(char *s, unsigned long long n, size_t bufsize) { double d; if (n < 1024) { /* Bytes */ - sprintf(s,"%lluB",n); + snprintf(s,bufsize,"%lluB",n); } else if (n < (1024*1024)) { d = (double)n/(1024); - sprintf(s,"%.2fK",d); + snprintf(s,bufsize,"%.2fK",d); } else if (n < (1024LL*1024*1024)) { d = (double)n/(1024*1024); - sprintf(s,"%.2fM",d); + snprintf(s,bufsize,"%.2fM",d); } else if (n < (1024LL*1024*1024*1024)) { d = (double)n/(1024LL*1024*1024); - sprintf(s,"%.2fG",d); + snprintf(s,bufsize,"%.2fG",d); } else if (n < (1024LL*1024*1024*1024*1024)) { d = (double)n/(1024LL*1024*1024*1024); - sprintf(s,"%.2fT",d); + snprintf(s,bufsize,"%.2fT",d); } else if (n < (1024LL*1024*1024*1024*1024*1024)) { d = (double)n/(1024LL*1024*1024*1024*1024); - sprintf(s,"%.2fP",d); + snprintf(s,bufsize,"%.2fP",d); } else { /* Let's hope we never need this */ - sprintf(s,"%lluB",n); + snprintf(s,bufsize,"%lluB",n); } } @@ -5559,7 +5660,8 @@ sds genRedisInfoString(const char *section) { "configured_hz:%i\r\n" "lru_clock:%u\r\n" "executable:%s\r\n" - "config_file:%s\r\n", + "config_file:%s\r\n" + "availability_zone:%s\r\n", KEYDB_SET_VERSION, redisGitSHA1(), strtol(redisGitDirty(),NULL,10) > 0, @@ -5585,7 +5687,8 @@ sds genRedisInfoString(const char *section) { g_pserver->config_hz, lruclock, cserver.executable ? cserver.executable : "", - cserver.configfile ? cserver.configfile : ""); + cserver.configfile ? cserver.configfile : "", + g_pserver->sdsAvailabilityZone); } /* Clients */ @@ -5634,6 +5737,7 @@ sds genRedisInfoString(const char *section) { const char *evict_policy = evictPolicyToString(); long long memory_lua = g_pserver->lua ? 
(long long)lua_gc(g_pserver->lua,LUA_GCCOUNT,0)*1024 : 0; struct redisMemOverhead *mh = getMemoryOverheadData(); + char available_system_mem[64] = "unavailable"; /* Peak memory is updated from time to time by serverCron() so it * may happen that the instantaneous value is slightly bigger than @@ -5642,13 +5746,17 @@ sds genRedisInfoString(const char *section) { if (zmalloc_used > g_pserver->stat_peak_memory) g_pserver->stat_peak_memory = zmalloc_used; - bytesToHuman(hmem,zmalloc_used); - bytesToHuman(peak_hmem,g_pserver->stat_peak_memory); - bytesToHuman(total_system_hmem,total_system_mem); - bytesToHuman(used_memory_lua_hmem,memory_lua); - bytesToHuman(used_memory_scripts_hmem,mh->lua_caches); - bytesToHuman(used_memory_rss_hmem,g_pserver->cron_malloc_stats.process_rss); - bytesToHuman(maxmemory_hmem,g_pserver->maxmemory); + if (g_pserver->cron_malloc_stats.sys_available) { + snprintf(available_system_mem, 64, "%lu", g_pserver->cron_malloc_stats.sys_available); + } + + bytesToHuman(hmem,zmalloc_used,sizeof(hmem)); + bytesToHuman(peak_hmem,g_pserver->stat_peak_memory,sizeof(peak_hmem)); + bytesToHuman(total_system_hmem,total_system_mem,sizeof(total_system_hmem)); + bytesToHuman(used_memory_lua_hmem,memory_lua,sizeof(used_memory_lua_hmem)); + bytesToHuman(used_memory_scripts_hmem,mh->lua_caches,sizeof(used_memory_scripts_hmem)); + bytesToHuman(used_memory_rss_hmem,g_pserver->cron_malloc_stats.process_rss,sizeof(used_memory_rss_hmem)); + bytesToHuman(maxmemory_hmem,g_pserver->maxmemory,sizeof(maxmemory_hmem)); if (sections++) info = sdscat(info,"\r\n"); info = sdscatprintf(info, @@ -5694,7 +5802,8 @@ sds genRedisInfoString(const char *section) { "active_defrag_running:%d\r\n" "lazyfree_pending_objects:%zu\r\n" "lazyfreed_objects:%zu\r\n" - "storage_provider:%s\r\n", + "storage_provider:%s\r\n" + "available_system_memory:%s\r\n", zmalloc_used, hmem, g_pserver->cron_malloc_stats.process_rss, @@ -5739,18 +5848,10 @@ sds genRedisInfoString(const char *section) { g_pserver->active_defrag_running, lazyfreeGetPendingObjectsCount(), lazyfreeGetFreedObjectsCount(), - g_pserver->m_pstorageFactory ? g_pserver->m_pstorageFactory->name() : "none" + g_pserver->m_pstorageFactory ? g_pserver->m_pstorageFactory->name() : "none", + available_system_mem ); freeMemoryOverheadData(mh); - - if (g_pserver->m_pstorageFactory) - { - info = sdscatprintf(info, - "%s_memory:%zu\r\n", - g_pserver->m_pstorageFactory->name(), - g_pserver->m_pstorageFactory->totalDiskspaceUsed() - ); - } } /* Persistence */ @@ -5874,6 +5975,10 @@ sds genRedisInfoString(const char *section) { (intmax_t)eta ); } + if (g_pserver->m_pstorageFactory) + { + info = sdscat(info, g_pserver->m_pstorageFactory->getInfo().get()); + } } /* Stats */ @@ -6535,7 +6640,8 @@ void usage(void) { void redisAsciiArt(void) { #include "asciilogo.h" - char *buf = (char*)zmalloc(1024*16, MALLOC_LOCAL); + size_t bufsize = 1024*16; + char *buf = (char*)zmalloc(bufsize, MALLOC_LOCAL); const char *mode; if (g_pserver->cluster_enabled) mode = "cluster"; @@ -6557,7 +6663,7 @@ void redisAsciiArt(void) { ); } else { sds motd = fetchMOTD(true, cserver.enable_motd); - snprintf(buf,1024*16,ascii_logo, + snprintf(buf,bufsize,ascii_logo, KEYDB_REAL_VERSION, redisGitSHA1(), strtol(redisGitDirty(),NULL,10) > 0, @@ -6691,7 +6797,7 @@ static void sigShutdownHandler(int sig) { * If we receive the signal the second time, we interpret this as * the user really wanting to quit ASAP without waiting to persist * on disk. 
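* A pending soft shutdown counts as the first request here, so a second SIGINT during a soft shutdown also takes this immediate-exit path.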
*/ - if (g_pserver->shutdown_asap && sig == SIGINT) { + if ((g_pserver->shutdown_asap || g_pserver->soft_shutdown) && sig == SIGINT) { serverLogFromHandler(LL_WARNING, "You insist... exiting now."); rdbRemoveTempFile(g_pserver->rdbThreadVars.tmpfileNum, 1); g_pserver->garbageCollector.shutdown(); @@ -6702,7 +6808,10 @@ static void sigShutdownHandler(int sig) { } serverLogFromHandler(LL_WARNING, msg); - g_pserver->shutdown_asap = 1; + if (g_pserver->config_soft_shutdown) + g_pserver->soft_shutdown = true; + else + g_pserver->shutdown_asap = 1; } void setupSignalHandlers(void) { @@ -6857,6 +6966,7 @@ int redisFork(int purpose) { latencyAddSampleIfNeeded("fork-lock",(ustime()-startWriteLock)/1000); if ((childpid = fork()) == 0) { /* Child */ + aeForkLockInChild(); aeReleaseForkLock(); g_pserver->in_fork_child = purpose; setOOMScoreAdj(CONFIG_OOM_BGCHILD); @@ -7299,7 +7409,7 @@ static void validateConfiguration() exit(EXIT_FAILURE); } - g_pserver->repl_backlog_config_size = g_pserver->repl_backlog_size; // this is normally set in the update logic, but not on initial config + g_pserver->repl_backlog_size = g_pserver->repl_backlog_config_size; // this is normally set in the update logic, but not on initial config } int iAmMaster(void) { @@ -7539,6 +7649,29 @@ int main(int argc, char **argv) { validateConfiguration(); + if (!g_pserver->sentinel_mode) { + #ifdef __linux__ + linuxMemoryWarnings(); + #if defined (__arm64__) + int ret; + if ((ret = linuxMadvFreeForkBugCheck())) { + if (ret == 1) + serverLog(LL_WARNING,"WARNING Your kernel has a bug that could lead to data corruption during background save. " + "Please upgrade to the latest stable kernel."); + else + serverLog(LL_WARNING, "Failed to test the kernel for a bug that could lead to data corruption during background save. " + "Your system could be affected, please report this error."); + if (!checkIgnoreWarning("ARM64-COW-BUG")) { + serverLog(LL_WARNING,"KeyDB will now exit to prevent data corruption. " + "Note that it is possible to suppress this warning by setting the following config: ignore-warnings ARM64-COW-BUG"); + exit(1); + } + } + #endif /* __arm64__ */ + #endif /* __linux__ */ + } + + const char *err; if (!initializeStorageProvider(&err)) { @@ -7564,25 +7697,6 @@ int main(int argc, char **argv) { if (!g_pserver->sentinel_mode) { /* Things not needed when running in Sentinel mode. */ serverLog(LL_WARNING,"Server initialized"); - #ifdef __linux__ - linuxMemoryWarnings(); - #if defined (__arm64__) - int ret; - if ((ret = linuxMadvFreeForkBugCheck())) { - if (ret == 1) - serverLog(LL_WARNING,"WARNING Your kernel has a bug that could lead to data corruption during background save. " - "Please upgrade to the latest stable kernel."); - else - serverLog(LL_WARNING, "Failed to test the kernel for a bug that could lead to data corruption during background save. " - "Your system could be affected, please report this error."); - if (!checkIgnoreWarning("ARM64-COW-BUG")) { - serverLog(LL_WARNING,"KeyDB will now exit to prevent data corruption. " - "Note that it is possible to suppress this warning by setting the following config: ignore-warnings ARM64-COW-BUG"); - exit(1); - } - } - #endif /* __arm64__ */ - #endif /* __linux__ */ moduleInitModulesSystemLast(); moduleLoadFromQueue(); ACLLoadUsersAtStartup(); diff --git a/src/server.h b/src/server.h index 2a56255ec..9ad1aab8d 100644 --- a/src/server.h +++ b/src/server.h @@ -122,6 +122,9 @@ typedef long long ustime_t; /* microsecond time type. 
*/ #define LOADING_BOOT 1 #define LOADING_REPLICATION 2 +#define OVERLOAD_PROTECT_PERIOD_MS 10'000 // 10 seconds +#define MAX_CLIENTS_SHED_PER_PERIOD (OVERLOAD_PROTECT_PERIOD_MS / 10) // Restrict to one client per 10ms + extern int g_fTestMode; extern struct redisServer *g_pserver; @@ -508,6 +511,7 @@ extern int configOOMScoreAdjValuesDefaults[CONFIG_OOM_COUNT]; #define CLIENT_PREVENT_AOF_PROP (1<<19) /* Don't propagate to AOF. */ #define CLIENT_PREVENT_REPL_PROP (1<<20) /* Don't propagate to slaves. */ #define CLIENT_PREVENT_PROP (CLIENT_PREVENT_AOF_PROP|CLIENT_PREVENT_REPL_PROP) +#define CLIENT_IGNORE_SOFT_SHUTDOWN (CLIENT_MASTER | CLIENT_SLAVE | CLIENT_BLOCKED | CLIENT_MONITOR) #define CLIENT_PENDING_WRITE (1<<21) /* Client has output to send but a write handler is yet not installed. */ #define CLIENT_REPLY_OFF (1<<22) /* Don't send replies to client. */ @@ -541,6 +545,7 @@ extern int configOOMScoreAdjValuesDefaults[CONFIG_OOM_COUNT]; #define CLIENT_REPL_RDBONLY (1ULL<<42) /* This client is a replica that only wants RDB without replication buffer. */ #define CLIENT_FORCE_REPLY (1ULL<<44) /* Should addReply be forced to write the text? */ +#define CLIENT_AUDIT_LOGGING (1ULL<<45) /* Client commands required audit logging */ /* Client block type (btype field in client structure) * if CLIENT_BLOCKED flag is set. */ @@ -1203,12 +1208,13 @@ class redisDbPersistentData void disableKeyCache(); bool keycacheIsEnabled(); - bool prefetchKeysAsync(client *c, struct parsed_command &command, bool fExecOK); + void prefetchKeysAsync(client *c, struct parsed_command &command); bool FSnapshot() const { return m_spdbSnapshotHOLDER != nullptr; } std::unique_ptr CloneStorageCache() { return std::unique_ptr(m_spstorage->clone()); } - void bulkStorageInsert(char **rgKeys, size_t *rgcbKeys, char **rgVals, size_t *rgcbVals, size_t celem); + std::shared_ptr getStorageCache() { return m_spstorage; } + void bulkDirectStorageInsert(char **rgKeys, size_t *rgcbKeys, char **rgVals, size_t *rgcbVals, size_t celem); dict_iter find_cached_threadsafe(const char *key) const; @@ -1369,7 +1375,8 @@ struct redisDb : public redisDbPersistentDataSnapshot using redisDbPersistentData::FRehashing; using redisDbPersistentData::FTrackingChanges; using redisDbPersistentData::CloneStorageCache; - using redisDbPersistentData::bulkStorageInsert; + using redisDbPersistentData::getStorageCache; + using redisDbPersistentData::bulkDirectStorageInsert; public: const redisDbPersistentDataSnapshot *createSnapshot(uint64_t mvccCheckpoint, bool fOptional) { @@ -1707,6 +1714,7 @@ struct client { size_t argv_len_sum() const; bool asyncCommand(std::function &)> &&mainFn, std::function &&postFn = nullptr); + char* fprint; }; struct saveparam { @@ -1908,6 +1916,12 @@ struct MasterSaveInfo { masterhost = sdsstring(sdsdup(mi.masterhost)); masterport = mi.masterport; } + MasterSaveInfo(const MasterSaveInfo &other) { + masterhost = other.masterhost; + masterport = other.masterport; + memcpy(master_replid, other.master_replid, sizeof(master_replid)); + master_initial_offset = other.master_initial_offset; + } MasterSaveInfo &operator=(const MasterSaveInfo &other) { masterhost = other.masterhost; @@ -2000,6 +2014,8 @@ struct malloc_stats { size_t allocator_allocated; size_t allocator_active; size_t allocator_resident; + size_t sys_total; + size_t sys_available; }; typedef struct socketFds { @@ -2190,6 +2206,9 @@ struct redisServerThreadVars { bool modulesEnabledThisAeLoop = false; /* In this loop of aeMain, were modules enabled before the thread went to 
sleep? */ bool disable_async_commands = false; /* this is only valid for one cycle of the AE loop and is reset in afterSleep */ + + int propagate_in_transaction = 0; /* Make sure we don't propagate nested MULTI/EXEC */ + int client_pause_in_transaction = 0; /* Was a client pause executed during this Exec? */ std::vector vecclientsProcess; dictAsyncRehashCtl *rehashCtl = nullptr; @@ -2295,9 +2314,7 @@ struct redisServer { int sentinel_mode; /* True if this instance is a Sentinel. */ size_t initial_memory_usage; /* Bytes used after initialization. */ int always_show_logo; /* Show logo even for non-stdout logging. */ - int propagate_in_transaction; /* Make sure we don't propagate nested MULTI/EXEC */ char *ignore_warnings; /* Config: warnings that should be ignored. */ - int client_pause_in_transaction; /* Was a client pause executed during this Exec? */ pause_type client_pause_type; /* True if clients are currently paused */ /* Modules */ ::dict *moduleapi; /* Exported core APIs dictionary for modules. */ @@ -2463,7 +2480,8 @@ struct redisServer { time_t lastbgsave_try; /* Unix time of last attempted bgsave */ time_t rdb_save_time_last; /* Time used by last RDB save run. */ time_t rdb_save_time_start; /* Current RDB save start time. */ - pid_t rdb_child_pid = -1; /* Used only during fork bgsave */ + mstime_t rdb_save_latency; /* Used to track end to end latency of rdb save*/ + pid_t rdb_child_pid = -1; /* Used only during fork bgsave */ int rdb_bgsave_scheduled; /* BGSAVE when possible if true. */ int rdb_child_type; /* Type of save by active child. */ int lastbgsave_status; /* C_OK or C_ERR */ @@ -2554,11 +2572,13 @@ struct redisServer { int get_ack_from_slaves; /* If true we send REPLCONF GETACK. */ /* Limits */ unsigned int maxclients; /* Max number of simultaneous clients */ + unsigned int maxclientsReserved; /* Reserved amount for health checks (localhost conns) */ unsigned long long maxmemory; /* Max number of memory bytes to use */ unsigned long long maxstorage; /* Max number of bytes to use in a storage provider */ int maxmemory_policy; /* Policy for key eviction */ int maxmemory_samples; /* Precision of random sampling */ int maxmemory_eviction_tenacity;/* Aggressiveness of eviction processing */ + int force_eviction_percent; /* Force eviction when this percent of system memory is remaining */ int lfu_log_factor; /* LFU logarithmic counter factor. */ int lfu_decay_time; /* LFU counter decay factor. */ long long proto_max_bulk_len; /* Protocol bulk length maximum size. */ @@ -2693,6 +2713,7 @@ struct redisServer { int tls_auth_clients; int tls_rotation; + std::set tls_auditlog_blocklist; /* Certificates that can be excluded from audit logging */ std::set tls_allowlist; redisTLSContextConfig tls_ctx_config; @@ -2721,11 +2742,24 @@ struct redisServer { long long repl_batch_offStart = -1; long long repl_batch_idxStart = -1; + long long rand_total_threshold; + + int config_soft_shutdown = false; + bool soft_shutdown = false; + + int flash_disable_key_cache = false; + /* Lock Contention Ring Buffer */ static const size_t s_lockContentionSamples = 64; uint16_t rglockSamples[s_lockContentionSamples]; unsigned ilockRingHead = 0; + + sds sdsAvailabilityZone; + int overload_protect_threshold = 0; + int is_overloaded = 0; + int overload_closed_clients = 0; + int module_blocked_pipe[2]; /* Pipe used to awake the event loop if a client blocked on a module command needs to be processed. 
@@ -2840,6 +2874,12 @@ typedef struct {
 #define OBJ_HASH_KEY 1
 #define OBJ_HASH_VALUE 2
 
+/* Used in evict.cpp */
+enum class EvictReason {
+    User,   /* User memory exceeded limit */
+    System  /* System memory exceeded limit */
+};
+
 /*-----------------------------------------------------------------------------
  * Extern declarations
  *----------------------------------------------------------------------------*/
@@ -3244,6 +3284,7 @@ void receiveChildInfo(void);
 void executeWithoutGlobalLock(std::function<void()> func);
 int redisFork(int type);
 int hasActiveChildProcess();
+int hasActiveChildProcessOrBGSave();
 void resetChildState();
 int isMutuallyExclusiveChildType(int type);
 
@@ -3343,7 +3384,7 @@ int zslLexValueGteMin(sds value, zlexrangespec *spec);
 int zslLexValueLteMax(sds value, zlexrangespec *spec);
 
 /* Core functions */
-int getMaxmemoryState(size_t *total, size_t *logical, size_t *tofree, float *level, bool fQuickCycle = false, bool fPreSnapshot=false);
+int getMaxmemoryState(size_t *total, size_t *logical, size_t *tofree, float *level, EvictReason *reason=nullptr, bool fQuickCycle=false, bool fPreSnapshot=false);
 size_t freeMemoryGetNotCountedMemory();
 int overMaxmemoryAfterAlloc(size_t moremem);
 int processCommand(client *c, int callFlags);
@@ -3628,6 +3669,9 @@ unsigned long LFUDecrAndReturn(robj_roptr o);
 #define EVICT_FAIL 2
 int performEvictions(bool fPreSnapshot);
 
+/* meminfo.cpp -- get memory info from /proc/meminfo for linux distros */
+size_t getMemAvailable();
+size_t getMemTotal();
 
 /* Keys hashing / comparison functions for dict.c hash tables. */
 uint64_t dictSdsHash(const void *key);
@@ -3882,13 +3926,13 @@ void incrementMvccTstamp();
 
 #if __GNUC__ >= 7 && !defined(NO_DEPRECATE_FREE)
 [[deprecated]]
-void *calloc(size_t count, size_t size);
+void *calloc(size_t count, size_t size) noexcept;
 [[deprecated]]
-void free(void *ptr);
+void free(void *ptr) noexcept;
 [[deprecated]]
-void *malloc(size_t size);
+void *malloc(size_t size) noexcept;
 [[deprecated]]
-void *realloc(void *ptr, size_t size);
+void *realloc(void *ptr, size_t size) noexcept;
 #endif
 
 /* Debugging stuff */
diff --git a/src/sort.cpp b/src/sort.cpp
index e31e0039c..5a074d017 100644
--- a/src/sort.cpp
+++ b/src/sort.cpp
@@ -194,8 +194,8 @@ void sortCommand(client *c) {
     list *operations;
     unsigned int outputlen = 0;
     int desc = 0, alpha = 0;
-    long limit_start = 0, limit_count = -1, start, end;
-    int j, dontsort = 0, vectorlen;
+    long limit_start = 0, limit_count = -1, start, end, vectorlen;
+    int j, dontsort = 0;
     int getop = 0;          /* GET operation counter */
     int int_conversion_error = 0;
     int syntax_error = 0;
@@ -321,8 +321,10 @@ void sortCommand(client *c) {
     default: vectorlen = 0; serverPanic("Bad SORT type"); /* Avoid GCC warning */
     }
 
-    /* Perform LIMIT start,count sanity checking. */
-    start = (limit_start < 0) ? 0 : limit_start;
+    /* Perform LIMIT start,count sanity checking.
+     * And avoid integer overflow by limiting inputs to object sizes. */
+    start = std::min(std::max(limit_start, (long)0), vectorlen);
+    limit_count = std::min(std::max(limit_count, (long)-1), vectorlen);
     end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
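+    /* Worked example (hypothetical values): with vectorlen == 10, a client
+     * sending LIMIT 9223372036854775807 5 now has start clamped to 10 and
+     * limit_count capped at vectorlen, so start+limit_count-1 above can no
+     * longer overflow a signed long; start >= vectorlen is then handled as
+     * an out-of-range request below. */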
     if (start >= vectorlen) {
         start = vectorlen-1;
diff --git a/src/storage/rocksdb.cpp b/src/storage/rocksdb.cpp
new file mode 100644
index 000000000..76eaa133a
--- /dev/null
+++ b/src/storage/rocksdb.cpp
@@ -0,0 +1,279 @@
+#include "rocksdb.h"
+#include
+#include
+#include
+#include
+#include "../server.h"
+#include "../cluster.h"
+#include "rocksdbfactor_internal.h"
+
+static const char keyprefix[] = INTERNAL_KEY_PREFIX;
+
+rocksdb::Options DefaultRocksDBOptions();
+extern "C" pid_t gettid();
+
+bool FInternalKey(const char *key, size_t cch)
+{
+    if (cch >= sizeof(INTERNAL_KEY_PREFIX))
+    {
+        if (memcmp(key, keyprefix, sizeof(INTERNAL_KEY_PREFIX)-1) == 0)
+            return true;
+    }
+    return false;
+}
+
+std::string getPrefix(unsigned int hashslot)
+{
+    char *hash_char = (char *)&hashslot;
+    return std::string(hash_char + (sizeof(unsigned int) - 2), 2);
+}
+
+std::string prefixKey(const char *key, size_t cchKey)
+{
+    return FInternalKey(key, cchKey) ? std::string(key, cchKey) : getPrefix(keyHashSlot(key, cchKey)) + std::string(key, cchKey);
+}
+
+RocksDBStorageProvider::RocksDBStorageProvider(RocksDBStorageFactory *pfactory, std::shared_ptr<rocksdb::DB> &spdb, std::shared_ptr<rocksdb::ColumnFamilyHandle> &spcolfam, const rocksdb::Snapshot *psnapshot, size_t count)
+    : m_pfactory(pfactory), m_spdb(spdb), m_psnapshot(psnapshot), m_spcolfamily(spcolfam), m_count(count)
+{
+    m_readOptionsTemplate = rocksdb::ReadOptions();
+    m_readOptionsTemplate.verify_checksums = false;
+    m_readOptionsTemplate.snapshot = m_psnapshot;
+}
+
+void RocksDBStorageProvider::insert(const char *key, size_t cchKey, void *data, size_t cb, bool fOverwrite)
+{
+    rocksdb::Status status;
+    std::unique_lock<fastlock> l(m_lock);
+    std::string prefixed_key = prefixKey(key, cchKey);
+    if (m_spbatch != nullptr)
+        status = m_spbatch->Put(m_spcolfamily.get(), rocksdb::Slice(prefixed_key), rocksdb::Slice((const char*)data, cb));
+    else
+        status = m_spdb->Put(WriteOptions(), m_spcolfamily.get(), rocksdb::Slice(prefixed_key), rocksdb::Slice((const char*)data, cb));
+    if (!status.ok())
+        throw status.ToString();
+
+    if (!fOverwrite)
+        ++m_count;
+}
+
+void RocksDBStorageProvider::bulkInsert(char **rgkeys, size_t *rgcbkeys, char **rgvals, size_t *rgcbvals, size_t celem)
+{
+    if (celem >= 16384) {
+        rocksdb::Options options = DefaultRocksDBOptions();
+        rocksdb::SstFileWriter sst_file_writer(rocksdb::EnvOptions(), options, options.comparator);
+        std::string file_path = m_pfactory->getTempFolder() + "/tmpIngest.";
+        file_path += std::to_string(gettid());
+        file_path += ".sst";
+
+        rocksdb::Status s = sst_file_writer.Open(file_path);
+        if (!s.ok())
+            goto LFallback;
+
+        // Insert rows into the SST file, note that inserted keys must be
+        // strictly increasing (based on options.comparator)
+        for (size_t ielem = 0; ielem < celem; ++ielem) {
+            std::string prefixed_key = prefixKey(rgkeys[ielem], rgcbkeys[ielem]);
+            s = sst_file_writer.Put(rocksdb::Slice(prefixed_key), rocksdb::Slice(rgvals[ielem], rgcbvals[ielem]));
+            if (!s.ok()) {
+                unlink(file_path.c_str());
+                goto LFallback;
+            }
+        }
+
+        s = sst_file_writer.Finish();
+        if (!s.ok()) {
+            unlink(file_path.c_str());
+            goto LFallback;
+        }
+
+        auto ingestOptions = rocksdb::IngestExternalFileOptions();
+        ingestOptions.move_files = true;
+        ingestOptions.write_global_seqno = false;
+        ingestOptions.failed_move_fall_back_to_copy = false;
+
+        // Ingest the external SST file into the DB
+        s = m_spdb->IngestExternalFile(m_spcolfamily.get(), {file_path}, ingestOptions);
+        if (!s.ok()) {
+            unlink(file_path.c_str());
+            goto LFallback;
+        }
+    } else {
+    LFallback:
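+        /* Note: the SST ingestion path above bypasses the memtable and WAL
+         * entirely, which is why it is only worth the setup cost for large
+         * batches; this fallback (small batches, or any SST writer error)
+         * funnels the same keys through a regular rocksdb::WriteBatch. */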
+        auto spbatch = std::make_unique<rocksdb::WriteBatch>();
+        for (size_t ielem = 0; ielem < celem; ++ielem) {
+            std::string prefixed_key = prefixKey(rgkeys[ielem], rgcbkeys[ielem]);
+            spbatch->Put(m_spcolfamily.get(), rocksdb::Slice(prefixed_key), rocksdb::Slice(rgvals[ielem], rgcbvals[ielem]));
+        }
+        m_spdb->Write(WriteOptions(), spbatch.get());
+    }
+
+    std::unique_lock<fastlock> l(m_lock);
+    m_count += celem;
+}
+
+bool RocksDBStorageProvider::erase(const char *key, size_t cchKey)
+{
+    rocksdb::Status status;
+    std::unique_lock<fastlock> l(m_lock);
+    std::string prefixed_key = prefixKey(key, cchKey);
+    if (!FKeyExists(prefixed_key))
+        return false;
+    if (m_spbatch != nullptr)
+    {
+        status = m_spbatch->Delete(m_spcolfamily.get(), rocksdb::Slice(prefixed_key));
+    }
+    else
+    {
+        status = m_spdb->Delete(WriteOptions(), m_spcolfamily.get(), rocksdb::Slice(prefixed_key));
+    }
+    if (status.ok())
+        --m_count;
+    return status.ok();
+}
+
+void RocksDBStorageProvider::retrieve(const char *key, size_t cchKey, callbackSingle fn) const
+{
+    rocksdb::PinnableSlice slice;
+    std::string prefixed_key = prefixKey(key, cchKey);
+    auto status = m_spdb->Get(ReadOptions(), m_spcolfamily.get(), rocksdb::Slice(prefixed_key), &slice);
+    if (status.ok())
+        fn(key, cchKey, slice.data(), slice.size());
+}
+
+size_t RocksDBStorageProvider::clear()
+{
+    size_t celem = count();
+    auto status = m_spdb->DropColumnFamily(m_spcolfamily.get());
+    auto strName = m_spcolfamily->GetName();
+
+    rocksdb::ColumnFamilyHandle *handle = nullptr;
+    rocksdb::ColumnFamilyOptions cf_options(m_pfactory->RocksDbOptions());
+    m_spdb->CreateColumnFamily(cf_options, strName, &handle);
+    m_spcolfamily = std::shared_ptr<rocksdb::ColumnFamilyHandle>(handle);
+
+    if (!status.ok())
+        throw status.ToString();
+    m_count = 0;
+    return celem;
+}
+
+size_t RocksDBStorageProvider::count() const
+{
+    std::unique_lock<fastlock> l(m_lock);
+    return m_count;
+}
+
+bool RocksDBStorageProvider::enumerate(callback fn) const
+{
+    std::unique_ptr<rocksdb::Iterator> it = std::unique_ptr<rocksdb::Iterator>(m_spdb->NewIterator(ReadOptions(), m_spcolfamily.get()));
+    size_t count = 0;
+    for (it->SeekToFirst(); it->Valid(); it->Next()) {
+        if (FInternalKey(it->key().data(), it->key().size()))
+            continue;
+        ++count;
+        bool fContinue = fn(it->key().data()+2, it->key().size()-2, it->value().data(), it->value().size());
+        if (!fContinue)
+            break;
+    }
+    if (!it->Valid() && count != m_count)
+    {
+        if (const_cast<RocksDBStorageProvider*>(this)->m_count != count)
+            printf("WARNING: rocksdb count mismatch\n");
+        const_cast<RocksDBStorageProvider*>(this)->m_count = count;
+    }
+    assert(it->status().ok()); // Check for any errors found during the scan
+    return !it->Valid();
+}
+
+bool RocksDBStorageProvider::enumerate_hashslot(callback fn, unsigned int hashslot) const
+{
+    std::string prefix = getPrefix(hashslot);
+    std::unique_ptr<rocksdb::Iterator> it = std::unique_ptr<rocksdb::Iterator>(m_spdb->NewIterator(ReadOptions(), m_spcolfamily.get()));
+    size_t count = 0;
+    for (it->Seek(prefix.c_str()); it->Valid(); it->Next()) {
+        if (FInternalKey(it->key().data(), it->key().size()))
+            continue;
+        if (strncmp(it->key().data(),prefix.c_str(),2) != 0)
+            break;
+        ++count;
+        bool fContinue = fn(it->key().data()+2, it->key().size()-2, it->value().data(), it->value().size());
+        if (!fContinue)
+            break;
+    }
+    bool full_iter = !it->Valid() || (strncmp(it->key().data(),prefix.c_str(),2) != 0);
+    if (full_iter && count != g_pserver->cluster->slots_keys_count[hashslot])
+    {
+        printf("WARNING: rocksdb hashslot count mismatch\n");
+    }
+    assert(!full_iter || count == g_pserver->cluster->slots_keys_count[hashslot]);
+    assert(it->status().ok()); // Check for any errors found during the scan
+    return full_iter;
+}
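+/* Note: clone() below pairs the shared rocksdb::DB handle with a point-in-time
+ * rocksdb::Snapshot. The constructor plugs that snapshot into
+ * m_readOptionsTemplate, so every read through the clone sees a stable view
+ * while the live provider keeps writing to the same database. */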
+
+const IStorage *RocksDBStorageProvider::clone() const
+{
+    std::unique_lock<fastlock> l(m_lock);
+    const rocksdb::Snapshot *psnapshot = const_cast<RocksDBStorageProvider*>(this)->m_spdb->GetSnapshot();
+    return new RocksDBStorageProvider(m_pfactory, const_cast<RocksDBStorageProvider*>(this)->m_spdb, const_cast<RocksDBStorageProvider*>(this)->m_spcolfamily, psnapshot, m_count);
+}
+
+RocksDBStorageProvider::~RocksDBStorageProvider()
+{
+    if (m_spbatch != nullptr)
+        endWriteBatch();
+
+    if (m_spdb != nullptr && m_psnapshot == nullptr)
+    {
+        insert(count_key, sizeof(count_key), &m_count, sizeof(m_count), false);
+        flush();
+    }
+
+    if (m_spdb != nullptr)
+    {
+        if (m_psnapshot != nullptr)
+            m_spdb->ReleaseSnapshot(m_psnapshot);
+    }
+}
+
+rocksdb::WriteOptions RocksDBStorageProvider::WriteOptions() const
+{
+    auto opt = rocksdb::WriteOptions();
+    return opt;
+}
+
+void RocksDBStorageProvider::beginWriteBatch()
+{
+    m_lock.lock();
+    m_spbatch = std::make_unique<rocksdb::WriteBatchWithIndex>();
+}
+
+void RocksDBStorageProvider::endWriteBatch()
+{
+    m_spdb->Write(WriteOptions(), m_spbatch.get()->GetWriteBatch());
+    m_spbatch = nullptr;
+    m_lock.unlock();
+}
+
+void RocksDBStorageProvider::batch_lock()
+{
+    m_lock.lock();
+}
+
+void RocksDBStorageProvider::batch_unlock()
+{
+    m_lock.unlock();
+}
+
+void RocksDBStorageProvider::flush()
+{
+    m_spdb->SyncWAL();
+}
+
+bool RocksDBStorageProvider::FKeyExists(std::string& key) const
+{
+    rocksdb::PinnableSlice slice;
+    if (m_spbatch)
+        return m_spbatch->GetFromBatchAndDB(m_spdb.get(), ReadOptions(), m_spcolfamily.get(), rocksdb::Slice(key), &slice).ok();
+    return m_spdb->Get(ReadOptions(), m_spcolfamily.get(), rocksdb::Slice(key), &slice).ok();
+}
\ No newline at end of file
diff --git a/src/storage/rocksdb.h b/src/storage/rocksdb.h
new file mode 100644
index 000000000..b78788eb2
--- /dev/null
+++ b/src/storage/rocksdb.h
@@ -0,0 +1,58 @@
+#pragma once
+
+#include
+#include "../IStorage.h"
+#include
+#include
+#include "../fastlock.h"
+
+#define INTERNAL_KEY_PREFIX "\x00\x04\x03\x00\x05\x02\x04"
+static const char count_key[] = INTERNAL_KEY_PREFIX "__keydb__count\1";
+static const char version_key[] = INTERNAL_KEY_PREFIX "__keydb__version\1";
+static const char meta_key[] = INTERNAL_KEY_PREFIX "__keydb__metadata\1";
+class RocksDBStorageFactory;
+
+class RocksDBStorageProvider : public IStorage
+{
+    RocksDBStorageFactory *m_pfactory;
+    std::shared_ptr<rocksdb::DB> m_spdb; // Note: This must be first so it is deleted last
+    std::unique_ptr<rocksdb::WriteBatchWithIndex> m_spbatch;
+    const rocksdb::Snapshot *m_psnapshot = nullptr;
+    std::shared_ptr<rocksdb::ColumnFamilyHandle> m_spcolfamily;
+    rocksdb::ReadOptions m_readOptionsTemplate;
+    size_t m_count = 0;
+    mutable fastlock m_lock {"RocksDBStorageProvider"};
+
+public:
+    RocksDBStorageProvider(RocksDBStorageFactory *pfactory, std::shared_ptr<rocksdb::DB> &spdb, std::shared_ptr<rocksdb::ColumnFamilyHandle> &spcolfam, const rocksdb::Snapshot *psnapshot, size_t count);
+    ~RocksDBStorageProvider();
+
+    virtual void insert(const char *key, size_t cchKey, void *data, size_t cb, bool fOverwrite) override;
+    virtual bool erase(const char *key, size_t cchKey) override;
+    virtual void retrieve(const char *key, size_t cchKey, callbackSingle fn) const override;
+    virtual size_t clear() override;
+    virtual bool enumerate(callback fn) const override;
+    virtual bool enumerate_hashslot(callback fn, unsigned int hashslot) const override;
+
+    virtual const IStorage *clone() const override;
+
+    virtual void beginWriteBatch() override;
+    virtual void endWriteBatch() override;
+
+    virtual void bulkInsert(char **rgkeys, size_t *rgcbkeys, char **rgvals, size_t *rgcbvals, size_t celem) override;
+
+    virtual void batch_lock() override;
+    virtual void batch_unlock() override;
+
+    virtual void flush() override;
+
+    size_t count() const override;
+
+protected:
+    bool FKeyExists(std::string&) const;
+
+    const rocksdb::ReadOptions &ReadOptions() const { return m_readOptionsTemplate; }
+    rocksdb::WriteOptions WriteOptions() const;
+};
+
+bool FInternalKey(const char *key, size_t cch);
\ No newline at end of file
diff --git a/src/storage/rocksdbfactor_internal.h b/src/storage/rocksdbfactor_internal.h
index 11862bb7b..dc27f6987 100644
--- a/src/storage/rocksdbfactor_internal.h
+++ b/src/storage/rocksdbfactor_internal.h
@@ -18,6 +18,7 @@ class RocksDBStorageFactory : public IStorageFactory
     virtual const char *name() const override;
     virtual size_t totalDiskspaceUsed() const override;
+    virtual sdsstring getInfo() const override;
     virtual bool FSlow() const override { return true; }
 
diff --git a/src/storage/rocksdbfactory.cpp b/src/storage/rocksdbfactory.cpp
new file mode 100644
index 000000000..5c3beeb4b
--- /dev/null
+++ b/src/storage/rocksdbfactory.cpp
@@ -0,0 +1,224 @@
+#include "rocksdb.h"
+#include "../version.h"
+#include
+#include
+#include
+#include
+#include
+#include
+#include "rocksdbfactor_internal.h"
+#include
+#include
+#include
+
+rocksdb::Options DefaultRocksDBOptions() {
+    rocksdb::Options options;
+    options.max_background_compactions = 4;
+    options.max_background_flushes = 2;
+    options.bytes_per_sync = 1048576;
+    options.compaction_pri = rocksdb::kMinOverlappingRatio;
+    options.compression = rocksdb::kNoCompression;
+    options.enable_pipelined_write = true;
+    options.allow_mmap_reads = true;
+    options.avoid_unnecessary_blocking_io = true;
+    options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(0));
+
+    rocksdb::BlockBasedTableOptions table_options;
+    table_options.block_size = 16 * 1024;
+    table_options.cache_index_and_filter_blocks = true;
+    table_options.pin_l0_filter_and_index_blocks_in_cache = true;
+    table_options.data_block_index_type = rocksdb::BlockBasedTableOptions::kDataBlockBinaryAndHash;
+    table_options.checksum = rocksdb::kNoChecksum;
+    table_options.format_version = 4;
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+    return options;
+}
+
+IStorageFactory *CreateRocksDBStorageFactory(const char *path, int dbnum, const char *rgchConfig, size_t cchConfig)
+{
+    return new RocksDBStorageFactory(path, dbnum, rgchConfig, cchConfig);
+}
+
+rocksdb::Options RocksDBStorageFactory::RocksDbOptions()
+{
+    rocksdb::Options options = DefaultRocksDBOptions();
+    options.max_open_files = filedsRequired();
+    options.sst_file_manager = m_pfilemanager;
+    options.create_if_missing = true;
+    options.create_missing_column_families = true;
+    options.info_log_level = rocksdb::ERROR_LEVEL;
+    options.max_total_wal_size = 1 * 1024 * 1024 * 1024;
+    return options;
+}
+
+RocksDBStorageFactory::RocksDBStorageFactory(const char *dbfile, int dbnum, const char *rgchConfig, size_t cchConfig)
+    : m_path(dbfile)
+{
+    dbnum++;    // create an extra db for metadata
+    // Get the count of column families in the actual database
+    std::vector<std::string> vecT;
+    auto status = rocksdb::DB::ListColumnFamilies(rocksdb::Options(), dbfile, &vecT);
+    // RocksDB requires we know the count of col families before opening, if the user only wants to see less
+    // we still have to make room for all column family handles regardless
+    if (status.ok() && (int)vecT.size() > dbnum)
+        dbnum = (int)vecT.size();
+
+    std::vector<rocksdb::ColumnFamilyDescriptor> veccoldesc;
+    veccoldesc.push_back(rocksdb::ColumnFamilyDescriptor(rocksdb::kDefaultColumnFamilyName, rocksdb::ColumnFamilyOptions()));  // ignore default col family
+
+    m_pfilemanager = std::shared_ptr<rocksdb::SstFileManager>(rocksdb::NewSstFileManager(rocksdb::Env::Default()));
+
+    rocksdb::DB *db = nullptr;
+
+    auto options = RocksDbOptions();
+    options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(2));
+
+    for (int idb = 0; idb < dbnum; ++idb)
+    {
+        rocksdb::ColumnFamilyOptions cf_options(options);
+        cf_options.level_compaction_dynamic_level_bytes = true;
+        veccoldesc.push_back(rocksdb::ColumnFamilyDescriptor(std::to_string(idb), cf_options));
+    }
+
+    if (rgchConfig != nullptr)
+    {
+        std::string options_string(rgchConfig, cchConfig);
+        rocksdb::Status status;
+        if (!(status = rocksdb::GetDBOptionsFromString(options, options_string, &options)).ok())
+        {
+            fprintf(stderr, "Failed to parse FLASH options: %s\r\n", status.ToString().c_str());
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    std::vector<rocksdb::ColumnFamilyHandle*> handles;
+    status = rocksdb::DB::Open(options, dbfile, veccoldesc, &handles, &db);
+    if (!status.ok())
+        throw status.ToString();
+
+    m_spdb = std::shared_ptr<rocksdb::DB>(db);
+    for (auto handle : handles)
+    {
+        std::string strVersion;
+        auto status = m_spdb->Get(rocksdb::ReadOptions(), handle, rocksdb::Slice(version_key, sizeof(version_key)), &strVersion);
+        if (!status.ok())
+        {
+            setVersion(handle);
+        }
+        else
+        {
+            SymVer ver = parseVersion(strVersion.c_str());
+            auto cmp = compareVersion(&ver);
+            if (cmp == NewerVersion)
+                throw "Cannot load FLASH database created by newer version of KeyDB";
+            if (cmp == OlderVersion)
+                setVersion(handle);
+        }
+        m_vecspcols.emplace_back(handle);
+    }
+}
+
+RocksDBStorageFactory::~RocksDBStorageFactory()
+{
+    m_spdb->SyncWAL();
+}
+
+void RocksDBStorageFactory::setVersion(rocksdb::ColumnFamilyHandle *handle)
+{
+    auto status = m_spdb->Put(rocksdb::WriteOptions(), handle, rocksdb::Slice(version_key, sizeof(version_key)), rocksdb::Slice(KEYDB_REAL_VERSION, strlen(KEYDB_REAL_VERSION)+1));
+    if (!status.ok())
+        throw status.ToString();
+}
+
+size_t RocksDBStorageFactory::filedsRequired() const {
+    return 256;
+}
+
+std::string RocksDBStorageFactory::getTempFolder()
+{
+    auto path = m_path + "/keydb_tmp/";
+    if (!m_fCreatedTempFolder) {
+        if (!mkdir(path.c_str(), 0700))
+            m_fCreatedTempFolder = true;
+    }
+    return path;
+}
+
+IStorage *RocksDBStorageFactory::createMetadataDb()
+{
+    IStorage *metadataDb = this->create(-1, nullptr, nullptr);
+    metadataDb->insert(meta_key, sizeof(meta_key), (void*)METADATA_DB_IDENTIFIER, strlen(METADATA_DB_IDENTIFIER), false);
+    return metadataDb;
+}
+
+IStorage *RocksDBStorageFactory::create(int db, key_load_iterator iter, void *privdata)
+{
+    ++db;   // skip default col family
+    std::shared_ptr<rocksdb::ColumnFamilyHandle> spcolfamily(m_vecspcols[db].release());
+    size_t count = 0;
+    bool fUnclean = false;
+
+    std::string value;
+    auto status = m_spdb->Get(rocksdb::ReadOptions(), spcolfamily.get(), rocksdb::Slice(count_key, sizeof(count_key)), &value);
+    if (status.ok() && value.size() == sizeof(size_t))
+    {
+        count = *reinterpret_cast<const size_t*>(value.data());
+        m_spdb->Delete(rocksdb::WriteOptions(), spcolfamily.get(), rocksdb::Slice(count_key, sizeof(count_key)));
+    }
+    else
+    {
+        fUnclean = true;
+    }
+
+    if (fUnclean || iter != nullptr)
+    {
+        count = 0;
+        auto opts = rocksdb::ReadOptions();
+        opts.tailing = true;
+        std::unique_ptr<rocksdb::Iterator> it = std::unique_ptr<rocksdb::Iterator>(m_spdb->NewIterator(opts, spcolfamily.get()));
+
+        it->SeekToFirst();
+        bool fFirstRealKey = true;
+
+        for (;it->Valid(); it->Next()) {
+            if (FInternalKey(it->key().data(), it->key().size()))
+                continue;
+            if (fUnclean && it->Valid() && fFirstRealKey)
+                printf("\tDatabase %d was not shutdown cleanly, recomputing metrics\n", db);
+            fFirstRealKey = false;
+            if (iter != nullptr)
+                iter(it->key().data()+2, it->key().size()-2, privdata);
+            ++count;
+        }
+    }
+    return new RocksDBStorageProvider(this, m_spdb, spcolfamily, nullptr, count);
+}
+
+const char *RocksDBStorageFactory::name() const
+{
+    return "flash";
+}
+
+size_t RocksDBStorageFactory::totalDiskspaceUsed() const
+{
+    return m_pfilemanager->GetTotalSize();
+}
+
+sdsstring RocksDBStorageFactory::getInfo() const
+{
+    struct statvfs fiData;
+    int status = statvfs(m_path.c_str(), &fiData);
+    if ( status == 0 ) {
+        return sdsstring(sdscatprintf(sdsempty(),
+            "storage_flash_used_bytes:%zu\r\n"
+            "storage_flash_total_bytes:%zu\r\n"
+            "storage_flash_rocksdb_bytes:%zu\r\n",
+            (fiData.f_blocks - fiData.f_bfree) * fiData.f_frsize,
+            fiData.f_blocks * fiData.f_frsize,
+            totalDiskspaceUsed()));
+    } else {
+        fprintf(stderr, "Failed to gather FLASH statistics with status: %d\r\n", status);
+        return sdsstring(sdsempty());
+    }
+}
diff --git a/src/storage/teststorageprovider.cpp b/src/storage/teststorageprovider.cpp
index a287397c7..b15924fc4 100644
--- a/src/storage/teststorageprovider.cpp
+++ b/src/storage/teststorageprovider.cpp
@@ -73,7 +73,22 @@ bool TestStorageProvider::enumerate(callback fn) const
     }
     return fAll;
 }
- 
+
+bool TestStorageProvider::enumerate_hashslot(callback fn, unsigned int hashslot) const
+{
+    bool fAll = true;
+    for (auto &pair : m_map)
+    {
+        if (keyHashSlot(pair.first.data(), pair.first.size()) == hashslot)
+            if (!fn(pair.first.data(), pair.first.size(), pair.second.data(), pair.second.size()))
+            {
+                fAll = false;
+                break;
+            }
+    }
+    return fAll;
+}
+
 size_t TestStorageProvider::count() const
 {
     return m_map.size();
diff --git a/src/storage/teststorageprovider.h b/src/storage/teststorageprovider.h
index f21732256..cb8c384f1 100644
--- a/src/storage/teststorageprovider.h
+++ b/src/storage/teststorageprovider.h
@@ -8,6 +8,7 @@ class TestStorageFactory : public IStorageFactory
     virtual class IStorage *createMetadataDb() override;
     virtual const char *name() const override;
     virtual size_t totalDiskspaceUsed() const override { return 0; }
+    virtual sdsstring getInfo() const override { return sdsstring(sdsempty()); }
     virtual bool FSlow() const override { return false; }
 };
 
@@ -24,6 +25,7 @@ class TestStorageProvider final : public IStorage
     virtual void retrieve(const char *key, size_t cchKey, callbackSingle fn) const override;
     virtual size_t clear() override;
     virtual bool enumerate(callback fn) const override;
+    virtual bool enumerate_hashslot(callback fn, unsigned int hashslot) const override;
     virtual size_t count() const override;
 
     virtual void flush() override;
diff --git a/src/t_hash.cpp b/src/t_hash.cpp
index 9eddbb449..e2d48d91d 100644
--- a/src/t_hash.cpp
+++ b/src/t_hash.cpp
@@ -729,6 +729,10 @@ void hincrbyfloatCommand(client *c) {
     unsigned int vlen;
 
     if (getLongDoubleFromObjectOrReply(c,c->argv[3],&incr,NULL) != C_OK) return;
+    if (isnan(incr) || isinf(incr)) {
+        addReplyError(c,"value is NaN or Infinity");
+        return;
+    }
     if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
     if (hashTypeGetValue(o,szFromObj(c->argv[2]),&vstr,&vlen,&ll) == C_OK) {
         if (vstr) {
@@ -1062,6 +1066,8 @@ void hrandfieldWithCountCommand(client *c, long l, int withvalues) {
             addReplyBulkCBuffer(c, key, sdslen(key));
             if (withvalues)
                 addReplyBulkCBuffer(c, value, sdslen(value));
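+            /* CLIENT_CLOSE_ASAP is set when the client is scheduled for
+             * disconnection (e.g. a huge COUNT blew through its
+             * client-output-buffer-limit); stop generating fields for it. */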
+            if (c->flags & CLIENT_CLOSE_ASAP)
+                break;
         }
     } else if (hash->encoding == OBJ_ENCODING_ZIPLIST) {
         ziplistEntry *keys, *vals = NULL;
@@ -1075,6 +1081,8 @@ void hrandfieldWithCountCommand(client *c, long l, int withvalues) {
             count -= sample_count;
             ziplistRandomPairs((unsigned char*)ptrFromObj(hash), sample_count, keys, vals);
             harndfieldReplyWithZiplist(c, sample_count, keys, vals);
+            if (c->flags & CLIENT_CLOSE_ASAP)
+                break;
         }
         zfree(keys);
         zfree(vals);
@@ -1221,12 +1229,17 @@ void hrandfieldCommand(client *c) {
     ziplistEntry ele;
 
     if (c->argc >= 3) {
-        if (getLongFromObjectOrReply(c,c->argv[2],&l,NULL) != C_OK) return;
+        if (getRangeLongFromObjectOrReply(c,c->argv[2],-LONG_MAX,LONG_MAX,&l,NULL) != C_OK) return;
         if (c->argc > 4 || (c->argc == 4 && strcasecmp(szFromObj(c->argv[3]),"withvalues"))) {
             addReplyErrorObject(c,shared.syntaxerr);
             return;
-        } else if (c->argc == 4)
+        } else if (c->argc == 4) {
             withvalues = 1;
+            if (l < -g_pserver->rand_total_threshold || l > g_pserver->rand_total_threshold) {
+                addReplyError(c,"value is out of range");
+                return;
+            }
+        }
         hrandfieldWithCountCommand(c, l, withvalues);
         return;
     }
diff --git a/src/t_nhash.cpp b/src/t_nhash.cpp
index 9cf4d81ea..b0d4b9dcf 100644
--- a/src/t_nhash.cpp
+++ b/src/t_nhash.cpp
@@ -263,7 +263,7 @@ sds writeJsonValue(sds output, const char *valIn, size_t cchIn) {
         serverAssert(!FSimpleJsonEscapeCh(valIn[ich]));
         if (FExtendedJsonEscapeCh(valIn[ich])) {
             dst[ichDst++] = '\\'; dst[ichDst++] = 'u';
-            sprintf(dst + ichDst, "%4x", valIn[ich]);
+            snprintf(dst + ichDst, cchIn+cchEscapeExtra-ichDst, "%4x", valIn[ich]);
             ichDst += 4;
         } else {
             dst[ichDst++] = valIn[ich];
diff --git a/src/t_set.cpp b/src/t_set.cpp
index 23b161c10..36342199c 100644
--- a/src/t_set.cpp
+++ b/src/t_set.cpp
@@ -672,7 +672,11 @@ void srandmemberWithCountCommand(client *c) {
     dict *d;
 
-    if (getLongFromObjectOrReply(c,c->argv[2],&l,NULL) != C_OK) return;
+    if (getRangeLongFromObjectOrReply(c,c->argv[2],-LONG_MAX,LONG_MAX,&l,NULL) != C_OK) return;
+    if (l < -g_pserver->rand_total_threshold || l > g_pserver->rand_total_threshold) {
+        addReplyError(c,"value is out of range");
+        return;
+    }
     if (l >= 0) {
         count = (unsigned long) l;
     } else {
@@ -706,6 +710,8 @@ void srandmemberWithCountCommand(client *c) {
         } else {
             addReplyBulkCBuffer(c,ele,sdslen(ele));
         }
+        if (c->flags & CLIENT_CLOSE_ASAP)
+            break;
     }
     return;
 }
diff --git a/src/t_stream.cpp b/src/t_stream.cpp
index b005cf600..ac895661e 100644
--- a/src/t_stream.cpp
+++ b/src/t_stream.cpp
@@ -789,13 +789,13 @@ int64_t streamTrim(stream *s, streamAddTrimArgs *args) {
              * update it after (and if) we actually remove the entry */
             unsigned char *pcopy = p;
-            int flags = lpGetInteger(p);
+            int64_t flags = lpGetInteger(p);
             p = lpNext(lp, p); /* Skip flags. */
-            int to_skip;
+            int64_t to_skip;
 
-            int ms_delta = lpGetInteger(p);
+            int64_t ms_delta = lpGetInteger(p);
             p = lpNext(lp, p); /* Skip ID ms delta */
-            int seq_delta = lpGetInteger(p);
+            int64_t seq_delta = lpGetInteger(p);
             p = lpNext(lp, p); /* Skip ID seq delta */
 
             streamID currid = {0}; /* For MINID */
@@ -1125,7 +1125,7 @@ int streamIteratorGetID(streamIterator *si, streamID *id, int64_t *numfields) {
              * the first entry emitted for this listpack, then we already
              * emitted the current entry, and have to go back to the previous
             * one. */
-            int lp_count = lpGetInteger(si->lp_ele);
+            int64_t lp_count = lpGetInteger(si->lp_ele);
             while(lp_count--) si->lp_ele = lpPrev(si->lp,si->lp_ele);
             /* Seek lp-count of prev entry. */
             si->lp_ele = lpPrev(si->lp,si->lp_ele);
@@ -1155,7 +1155,7 @@ int streamIteratorGetID(streamIterator *si, streamID *id, int64_t *numfields) {
 
         /* Get the flags entry. */
         si->lp_flags = si->lp_ele;
-        int flags = lpGetInteger(si->lp_ele);
+        int64_t flags = lpGetInteger(si->lp_ele);
         si->lp_ele = lpNext(si->lp,si->lp_ele); /* Seek ID. */
 
         /* Get the ID: it is encoded as difference between the master
@@ -1264,7 +1264,7 @@ void streamIteratorRemoveEntry(streamIterator *si, streamID *current) {
      * deleted entries in the listpack header.
      *
      * We start flagging: */
-    int flags = lpGetInteger(si->lp_flags);
+    int64_t flags = lpGetInteger(si->lp_flags);
     flags |= STREAM_ITEM_FLAG_DELETED;
     lp = lpReplaceInteger(lp,&si->lp_flags,flags);
diff --git a/src/t_string.cpp b/src/t_string.cpp
index b9a417061..f946ba3a3 100644
--- a/src/t_string.cpp
+++ b/src/t_string.cpp
@@ -38,8 +38,14 @@ int getGenericCommand(client *c);
  * String Commands
  *----------------------------------------------------------------------------*/
 
-static int checkStringLength(client *c, long long size) {
-    if (!(c->flags & CLIENT_MASTER) && size > g_pserver->proto_max_bulk_len) {
+static int checkStringLength(client *c, long long size, long long append) {
+    if (c->flags & CLIENT_MASTER)
+        return C_OK;
+    /* 'uint64_t' cast is there just to prevent undefined behavior on overflow */
+    long long total = (uint64_t)size + append;
+    /* Test configured max-bulk-len representing a limit of the biggest string object,
+     * and also test for overflow. */
+    if (total > g_pserver->proto_max_bulk_len || total < size || total < append) {
         addReplyError(c,"string exceeds maximum allowed size (proto-max-bulk-len)");
         return C_ERR;
     }
@@ -445,7 +451,7 @@ void setrangeCommand(client *c) {
         }
 
         /* Return when the resulting string exceeds allowed size */
-        if (checkStringLength(c,offset+sdslen(value)) != C_OK)
+        if (checkStringLength(c,offset,sdslen(value)) != C_OK)
             return;
 
         o = createObject(OBJ_STRING,sdsnewlen(NULL, offset+sdslen(value)));
@@ -465,7 +471,7 @@ void setrangeCommand(client *c) {
         }
 
         /* Return when the resulting string exceeds allowed size */
-        if (checkStringLength(c,offset+sdslen(value)) != C_OK)
+        if (checkStringLength(c,offset,sdslen(value)) != C_OK)
             return;
 
         /* Create a copy when the object is shared or encoded. */
@@ -685,8 +691,7 @@ void appendCommand(client *c) {
         /* "append" is an argument, so always an sds */
         append = c->argv[2];
-        totlen = stringObjectLen(o)+sdslen((sds)ptrFromObj(append));
-        if (checkStringLength(c,totlen) != C_OK)
+        if (checkStringLength(c,stringObjectLen(o),sdslen((sds)ptrFromObj(append))) != C_OK)
             return;
 
         /* Append the value */
diff --git a/src/t_zset.cpp b/src/t_zset.cpp
index ad4ea2bd4..2ee1d3cc9 100644
--- a/src/t_zset.cpp
+++ b/src/t_zset.cpp
@@ -4054,6 +4054,8 @@ void zrandmemberWithCountCommand(client *c, long l, int withscores) {
             addReplyBulkCBuffer(c, key, sdslen(key));
             if (withscores)
                 addReplyDouble(c, *(double*)dictGetVal(de));
+            if (c->flags & CLIENT_CLOSE_ASAP)
+                break;
         }
     } else if (zsetobj->encoding == OBJ_ENCODING_ZIPLIST) {
         ziplistEntry *keys, *vals = NULL;
@@ -4067,6 +4069,8 @@ void zrandmemberWithCountCommand(client *c, long l, int withscores) {
             count -= sample_count;
             ziplistRandomPairs((unsigned char*)ptrFromObj(zsetobj), sample_count, keys, vals);
             zarndmemberReplyWithZiplist(c, sample_count, keys, vals);
+            if (c->flags & CLIENT_CLOSE_ASAP)
+                break;
         }
         zfree(keys);
         zfree(vals);
@@ -4210,12 +4214,17 @@ void zrandmemberCommand(client *c) {
     ziplistEntry ele;
 
     if (c->argc >= 3) {
-        if (getLongFromObjectOrReply(c,c->argv[2],&l,NULL) != C_OK) return;
+        if (getRangeLongFromObjectOrReply(c,c->argv[2],-LONG_MAX,LONG_MAX,&l,NULL) != C_OK) return;
         if (c->argc > 4 || (c->argc == 4 && strcasecmp(szFromObj(c->argv[3]),"withscores"))) {
             addReplyErrorObject(c,shared.syntaxerr);
             return;
-        } else if (c->argc == 4)
+        } else if (c->argc == 4) {
             withscores = 1;
+            if (l < -g_pserver->rand_total_threshold || l > g_pserver->rand_total_threshold) {
+                addReplyError(c,"value is out of range");
+                return;
+            }
+        }
         zrandmemberWithCountCommand(c, l, withscores);
         return;
     }
diff --git a/src/tls.cpp b/src/tls.cpp
index 68651bfbb..8a8a97a95 100644
--- a/src/tls.cpp
+++ b/src/tls.cpp
@@ -46,6 +46,9 @@
 #include
 #include
+#if OPENSSL_VERSION_NUMBER >= 0x30000000L
+#include <openssl/decoder.h>
+#endif
 
 #define REDIS_TLS_PROTO_TLSv1       (1<<0)
 #define REDIS_TLS_PROTO_TLSv1_1     (1<<1)
@@ -154,14 +157,13 @@ void tlsInit(void) {
      */
 #if OPENSSL_VERSION_NUMBER < 0x10100000L
     OPENSSL_config(NULL);
+    SSL_load_error_strings();
+    SSL_library_init();
 #elif OPENSSL_VERSION_NUMBER < 0x10101000L
     OPENSSL_init_crypto(OPENSSL_INIT_LOAD_CONFIG, NULL);
 #else
     OPENSSL_init_crypto(OPENSSL_INIT_LOAD_CONFIG|OPENSSL_INIT_ATFORK, NULL);
 #endif
-    ERR_load_crypto_strings();
-    SSL_load_error_strings();
-    SSL_library_init();
 
 #ifdef USE_CRYPTO_LOCKS
     initCryptoLocks();
@@ -364,20 +366,46 @@ int tlsConfigure(redisTLSContextConfig *ctx_config) {
     if (ctx_config->prefer_server_ciphers)
         SSL_CTX_set_options(ctx, SSL_OP_CIPHER_SERVER_PREFERENCE);
 
-#if defined(SSL_CTX_set_ecdh_auto)
+#if ((OPENSSL_VERSION_NUMBER < 0x30000000L) && defined(SSL_CTX_set_ecdh_auto))
     SSL_CTX_set_ecdh_auto(ctx, 1);
 #endif
     SSL_CTX_set_options(ctx, SSL_OP_SINGLE_DH_USE);
 
     if (ctx_config->dh_params_file) {
         FILE *dhfile = fopen(ctx_config->dh_params_file, "r");
-        DH *dh = NULL;
         if (!dhfile) {
             serverLog(LL_WARNING, "Failed to load %s: %s", ctx_config->dh_params_file, strerror(errno));
             goto error;
         }
 
-        dh = PEM_read_DHparams(dhfile, NULL, NULL, NULL);
+#if (OPENSSL_VERSION_NUMBER >= 0x30000000L)
+        EVP_PKEY *pkey = NULL;
+        OSSL_DECODER_CTX *dctx = OSSL_DECODER_CTX_new_for_pkey(
+            &pkey, "PEM", NULL, "DH", OSSL_KEYMGMT_SELECT_DOMAIN_PARAMETERS, NULL, NULL);
+        if (!dctx) {
+            serverLog(LL_WARNING, "No decoder for DH params.");
+            fclose(dhfile);
+            goto error;
+        }
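+        /* With OpenSSL >= 3.0 the DH parameters are decoded through the
+         * OSSL_DECODER API into an EVP_PKEY; the deprecated
+         * PEM_read_DHparams/SSL_CTX_set_tmp_dh path is kept below for older
+         * versions. */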
+        if (!OSSL_DECODER_from_fp(dctx, dhfile)) {
+            serverLog(LL_WARNING, "%s: failed to read DH params.", ctx_config->dh_params_file);
+            OSSL_DECODER_CTX_free(dctx);
+            fclose(dhfile);
+            goto error;
+        }
+
+        OSSL_DECODER_CTX_free(dctx);
+        fclose(dhfile);
+
+        if (SSL_CTX_set0_tmp_dh_pkey(ctx, pkey) <= 0) {
+            ERR_error_string_n(ERR_get_error(), errbuf, sizeof(errbuf));
+            serverLog(LL_WARNING, "Failed to load DH params file: %s: %s", ctx_config->dh_params_file, errbuf);
+            EVP_PKEY_free(pkey);
+            goto error;
+        }
+        /* Not freeing pkey, it is owned by OpenSSL now */
+#else
+        DH *dh = PEM_read_DHparams(dhfile, NULL, NULL, NULL);
         fclose(dhfile);
         if (!dh) {
             serverLog(LL_WARNING, "%s: failed to read DH params.", ctx_config->dh_params_file);
@@ -392,6 +420,11 @@ int tlsConfigure(redisTLSContextConfig *ctx_config) {
         }
 
         DH_free(dh);
+#endif
+    } else {
+#if (OPENSSL_VERSION_NUMBER >= 0x30000000L)
+        SSL_CTX_set_dh_auto(ctx, 1);
+#endif
     }
 
     /* If a client-side certificate is configured, create an explicit client context */
@@ -484,36 +517,75 @@ typedef struct tls_connection {
     aeEventLoop *el;
 } tls_connection;
 
-/* Check to see if a given client name matches against our allowlist.
+/* Check to see if a given client name is contained in the provided set (allowlist/blocklist).
  * Return true if it does */
-bool tlsCheckAgainstAllowlist(const char * client){
+bool tlsCheckAgainstAllowlist(const char * client, std::set<sdsimmutablestring> set){
     /* Because of wildcard matching, we need to iterate over the entire set.
      * If we were doing simply straight matching, we could just directly
      * check to see if the client name is in the set in O(1) time */
-    for (auto &client_pattern: g_pserver->tls_allowlist){
+    for (auto &client_pattern: set){
        if (stringmatchlen(client_pattern.get(), client_pattern.size(), client, strlen(client), 1))
            return true;
     }
     return false;
 }
 
+/* Sets the sha256 certificate fingerprint on the connection
+ * Based on the example here https://fm4dd.com/openssl/certfprint.shtm */
+void tlsSetCertificateFingerprint(tls_connection* conn, X509 * cert) {
+    unsigned int fprint_size;
+    unsigned char fprint[EVP_MAX_MD_SIZE];
+    const EVP_MD *fprint_type = EVP_sha256();
+    X509_digest(cert, fprint_type, fprint, &fprint_size);
+
+    if (conn->c.fprint) zfree(conn->c.fprint);
+    conn->c.fprint = (char*)zcalloc(fprint_size*2+1);
+
+    /* Format fingerprint as hex string */
+    char tmp[3];
+    for (unsigned int i = 0; i < fprint_size; i++) {
+        snprintf(tmp, sizeof(tmp), "%02x", (unsigned int)fprint[i]);
+        strncat(conn->c.fprint, tmp, 2);
+    }
+}
+
 /* ASN1_STRING_get0_data was introduced in OPENSSL 1.1.1
  * use ASN1_STRING_data for older versions where it is not available */
 #if OPENSSL_VERSION_NUMBER < 0x10100000L
 #define ASN1_STRING_get0_data ASN1_STRING_data
 #endif
 
-bool tlsValidateCertificateName(tls_connection* conn){
-    if (g_pserver->tls_allowlist.empty())
-        return true;    // Empty list implies acceptance of all
+class TCleanup {
+    std::function<void()> fn;
+
+public:
+    TCleanup(std::function<void()> fn)
+        : fn(fn)
+    {}
+
+    ~TCleanup() {
+        fn();
+    }
+};
+
+bool tlsCheckCertificateAgainstAllowlist(tls_connection* conn, std::set<sdsimmutablestring> allowlist, const char** commonName){
+    if (allowlist.empty()){
+        // An empty list implies acceptance of all
+        return true;
+    }
 
     X509 * cert = SSL_get_peer_certificate(conn->ssl);
+    TCleanup certClen([cert]{X509_free(cert);});
+
     /* Check the common name (CN) of the certificate first */
     X509_NAME_ENTRY * ne = X509_NAME_get_entry(X509_get_subject_name(cert), X509_NAME_get_index_by_NID(X509_get_subject_name(cert), NID_commonName, -1));
-    const char * commonName = reinterpret_cast<const char*>(ASN1_STRING_get0_data(X509_NAME_ENTRY_get_data(ne)));
-
-    if (tlsCheckAgainstAllowlist(commonName))
+    *commonName = reinterpret_cast<const char*>(ASN1_STRING_get0_data(X509_NAME_ENTRY_get_data(ne)));
+
+    tlsSetCertificateFingerprint(conn, cert);
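+    /* The fingerprint is computed before any allowlist short-circuit so that
+     * conn->c.fprint is always populated for the audit-logging decision,
+     * regardless of whether the name check below succeeds. */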
+
+    if (tlsCheckAgainstAllowlist(*commonName, allowlist)) {
         return true;
+    }
 
     /* If that fails, check through the subject alternative names (SANs) as well */
     GENERAL_NAMES* subjectAltNames = (GENERAL_NAMES*)X509_get_ext_d2i(cert, NID_subject_alt_name, NULL, NULL);
@@ -526,19 +598,19 @@ bool tlsValidateCertificateName(tls_connection* conn){
             switch (generalName->type)
             {
                 case GEN_EMAIL:
-                    if (tlsCheckAgainstAllowlist(reinterpret_cast<const char*>(ASN1_STRING_get0_data(generalName->d.rfc822Name)))){
+                    if (tlsCheckAgainstAllowlist(reinterpret_cast<const char*>(ASN1_STRING_get0_data(generalName->d.rfc822Name)), allowlist)){
                         sk_GENERAL_NAME_pop_free(subjectAltNames, GENERAL_NAME_free);
                         return true;
                     }
                     break;
                 case GEN_DNS:
-                    if (tlsCheckAgainstAllowlist(reinterpret_cast<const char*>(ASN1_STRING_get0_data(generalName->d.dNSName)))){
+                    if (tlsCheckAgainstAllowlist(reinterpret_cast<const char*>(ASN1_STRING_get0_data(generalName->d.dNSName)), allowlist)){
                         sk_GENERAL_NAME_pop_free(subjectAltNames, GENERAL_NAME_free);
                         return true;
                     }
                     break;
                 case GEN_URI:
-                    if (tlsCheckAgainstAllowlist(reinterpret_cast<const char*>(ASN1_STRING_get0_data(generalName->d.uniformResourceIdentifier)))){
+                    if (tlsCheckAgainstAllowlist(reinterpret_cast<const char*>(ASN1_STRING_get0_data(generalName->d.uniformResourceIdentifier)), allowlist)){
                         sk_GENERAL_NAME_pop_free(subjectAltNames, GENERAL_NAME_free);
                         return true;
                     }
                     break;
@@ -549,7 +621,7 @@ bool tlsValidateCertificateName(tls_connection* conn){
                     if (ipLen == 4){    //IPv4 case
                         char addr[INET_ADDRSTRLEN];
                         inet_ntop(AF_INET, ASN1_STRING_get0_data(generalName->d.iPAddress), addr, INET_ADDRSTRLEN);
-                        if (tlsCheckAgainstAllowlist(addr)){
+                        if (tlsCheckAgainstAllowlist(addr, allowlist)){
                             sk_GENERAL_NAME_pop_free(subjectAltNames, GENERAL_NAME_free);
                             return true;
                         }
@@ -565,14 +637,36 @@ bool tlsValidateCertificateName(tls_connection* conn){
         sk_GENERAL_NAME_pop_free(subjectAltNames, GENERAL_NAME_free);
     }
 
-    /* If neither the CN nor the SANs match, update the SSL error and return false */
-    conn->c.last_errno = 0;
-    if (conn->ssl_error) zfree(conn->ssl_error);
-    conn->ssl_error = (char*)zmalloc(512);
-    snprintf(conn->ssl_error, 512, "Client CN (%s) and SANs not found in allowlist.", commonName);
     return false;
 }
 
+bool tlsCertificateRequiresAuditLogging(tls_connection* conn){
+    const char* cn = "";
+    if (tlsCheckCertificateAgainstAllowlist(conn, g_pserver->tls_auditlog_blocklist, &cn)) {
+        // Certificate is in exclusion list, no need to audit log
+        serverLog(LL_NOTICE, "Audit Log: disabled for %s", conn->c.fprint);
+        return false;
+    } else {
+        serverLog(LL_NOTICE, "Audit Log: enabled for %s", conn->c.fprint);
+        return true;
+    }
+}
+
+bool tlsValidateCertificateName(tls_connection* conn){
+    const char* cn = "";
+    if (tlsCheckCertificateAgainstAllowlist(conn, g_pserver->tls_allowlist, &cn)) {
+        return true;
+    } else {
+        /* If neither the CN nor the SANs match, update the SSL error and return false */
+        conn->c.last_errno = 0;
+        if (conn->ssl_error) zfree(conn->ssl_error);
+        size_t bufsize = 512;
+        conn->ssl_error = (char*)zmalloc(bufsize);
+        snprintf(conn->ssl_error, bufsize, "Client CN (%s) and SANs not found in allowlist.", cn);
+        return false;
+    }
+}
+
 static connection *createTLSConnection(int client_side) {
     SSL_CTX *ctx = redis_tls_ctx;
     if (client_side && redis_tls_client_ctx)
@@ -795,6 +889,9 @@ void tlsHandleEvent(tls_connection *conn, int mask) {
                 conn->c.state = CONN_STATE_ERROR;
             } else {
                 conn->c.state = CONN_STATE_CONNECTED;
+                if (tlsCertificateRequiresAuditLogging(conn)){
+                    conn->c.flags |= CONN_FLAG_AUDIT_LOGGING_REQUIRED;
+                }
             }
         }
 
diff --git a/src/ziplist.c b/src/ziplist.c
index dc828428b..7a18ae696 100644
--- a/src/ziplist.c
+++ b/src/ziplist.c
@@ -1704,17 +1704,17 @@ static unsigned char *createIntList() {
     unsigned char *zl = ziplistNew();
     char buf[32];
 
-    sprintf(buf, "100");
+    snprintf(buf, sizeof(buf), "100");
     zl = ziplistPush(zl, (unsigned char*)buf, strlen(buf), ZIPLIST_TAIL);
-    sprintf(buf, "128000");
+    snprintf(buf, sizeof(buf), "128000");
     zl = ziplistPush(zl, (unsigned char*)buf, strlen(buf), ZIPLIST_TAIL);
-    sprintf(buf, "-100");
+    snprintf(buf, sizeof(buf), "-100");
     zl = ziplistPush(zl, (unsigned char*)buf, strlen(buf), ZIPLIST_HEAD);
-    sprintf(buf, "4294967296");
+    snprintf(buf, sizeof(buf), "4294967296");
     zl = ziplistPush(zl, (unsigned char*)buf, strlen(buf), ZIPLIST_HEAD);
-    sprintf(buf, "non integer");
+    snprintf(buf, sizeof(buf), "non integer");
     zl = ziplistPush(zl, (unsigned char*)buf, strlen(buf), ZIPLIST_TAIL);
-    sprintf(buf, "much much longer non integer");
+    snprintf(buf, sizeof(buf), "much much longer non integer");
     zl = ziplistPush(zl, (unsigned char*)buf, strlen(buf), ZIPLIST_TAIL);
     return zl;
 }
@@ -2228,7 +2228,7 @@ int ziplistTest(int argc, char **argv, int accurate) {
         char buf[32];
         int i,len;
         for (i = 0; i < 1000; i++) {
-            len = sprintf(buf,"%d",i);
+            len = snprintf(buf,sizeof(buf),"%d",i);
             zl = ziplistPush(zl,(unsigned char*)buf,len,ZIPLIST_TAIL);
         }
         for (i = 0; i < 1000; i++) {
@@ -2375,13 +2375,13 @@ int ziplistTest(int argc, char **argv, int accurate) {
             } else {
                 switch(rand() % 3) {
                 case 0:
-                    buflen = sprintf(buf,"%lld",(0LL + rand()) >> 20);
+                    buflen = snprintf(buf,sizeof(buf),"%lld",(0LL + rand()) >> 20);
                     break;
                 case 1:
-                    buflen = sprintf(buf,"%lld",(0LL + rand()));
+                    buflen = snprintf(buf,sizeof(buf),"%lld",(0LL + rand()));
                     break;
                 case 2:
-                    buflen = sprintf(buf,"%lld",(0LL + rand()) << 20);
+                    buflen = snprintf(buf,sizeof(buf),"%lld",(0LL + rand()) << 20);
                     break;
                 default:
                     assert(NULL);
@@ -2410,7 +2410,7 @@ int ziplistTest(int argc, char **argv, int accurate) {
 
             assert(ziplistGet(p,&sstr,&slen,&sval));
             if (sstr == NULL) {
-                buflen = sprintf(buf,"%lld",sval);
+                buflen = snprintf(buf,sizeof(buf),"%lld",sval);
             } else {
                 buflen = slen;
                 memcpy(buf,sstr,buflen);
diff --git a/src/zmalloc.cpp b/src/zmalloc.cpp
index dac6f6631..81bd47f42 100644
--- a/src/zmalloc.cpp
+++ b/src/zmalloc.cpp
@@ -80,12 +80,12 @@ static_assert((PREFIX_SIZE % 16) == 0, "Our prefix must be modulo 16-bytes or ou
 #define realloc(ptr,size, type) tc_realloc(ptr,size)
 #define free(ptr) tc_free(ptr)
 #elif defined(USE_JEMALLOC)
-#define malloc(size, type) je_malloc(size)
-#define calloc(count,size,type) je_calloc(count,size)
-#define realloc(ptr,size,type) je_realloc(ptr,size)
-#define free(ptr) je_free(ptr)
-#define mallocx(size,flags) je_mallocx(size,flags)
-#define dallocx(ptr,flags) je_dallocx(ptr,flags)
+#define malloc(size, type) malloc(size)
+#define calloc(count,size,type) calloc(count,size)
+#define realloc(ptr,size,type) realloc(ptr,size)
+#define free(ptr) free(ptr)
+#define mallocx(size,flags) mallocx(size,flags)
+#define dallocx(ptr,flags) dallocx(ptr,flags)
 #else
 #define malloc(size, type) malloc(size)
 #define calloc(count,size,type) calloc(count,size)
@@ -370,7 +370,7 @@ size_t zmalloc_get_rss(void) {
     int fd, count;
     char *p, *x;
 
-    snprintf(filename,256,"/proc/%ld/stat",(long) getpid());
+    snprintf(filename,sizeof(filename),"/proc/%ld/stat",(long) getpid());
     if ((fd = open(filename,O_RDONLY)) == -1) return 0;
     if (read(fd,buf,4096) <= 0) {
         close(fd);
@@ -462,7 +462,7 @@ size_t zmalloc_get_rss(void) {
     char filename[256];
     int fd;
 
-    snprintf(filename,256,"/proc/%ld/psinfo",(long) getpid());
+    snprintf(filename,sizeof(filename),"/proc/%ld/psinfo",(long) getpid());
     if ((fd = open(filename,O_RDONLY)) == -1) return 0;
 
     if (ioctl(fd, PIOCPSINFO, &info) == -1) {
@@ -494,17 +494,17 @@ int zmalloc_get_allocator_info(size_t *allocated,
     *allocated = *resident = *active = 0;
     /* Update the statistics cached by mallctl. */
     sz = sizeof(epoch);
-    je_mallctl("epoch", &epoch, &sz, &epoch, sz);
+    mallctl("epoch", &epoch, &sz, &epoch, sz);
     sz = sizeof(size_t);
     /* Unlike RSS, this does not include RSS from shared libraries and other non
      * heap mappings. */
-    je_mallctl("stats.resident", resident, &sz, NULL, 0);
+    mallctl("stats.resident", resident, &sz, NULL, 0);
     /* Unlike resident, this doesn't not include the pages jemalloc reserves
      * for re-use (purge will clean that). */
-    je_mallctl("stats.active", active, &sz, NULL, 0);
+    mallctl("stats.active", active, &sz, NULL, 0);
     /* Unlike zmalloc_used_memory, this matches the stats.resident by taking
      * into account all allocations done by this process (not only zmalloc). */
-    je_mallctl("stats.allocated", allocated, &sz, NULL, 0);
+    mallctl("stats.allocated", allocated, &sz, NULL, 0);
     return 1;
 }
 
@@ -512,7 +512,7 @@ void set_jemalloc_bg_thread(int enable) {
     /* let jemalloc do purging asynchronously, required when there's no traffic
      * after flushdb */
     char val = !!enable;
-    je_mallctl("background_thread", NULL, 0, &val, 1);
+    mallctl("background_thread", NULL, 0, &val, 1);
 }
 
 int jemalloc_purge() {
@@ -520,9 +520,9 @@ int jemalloc_purge() {
     char tmp[32];
     unsigned narenas = 0;
     size_t sz = sizeof(unsigned);
-    if (!je_mallctl("arenas.narenas", &narenas, &sz, NULL, 0)) {
-        sprintf(tmp, "arena.%d.purge", narenas);
-        if (!je_mallctl(tmp, NULL, 0, NULL, 0))
+    if (!mallctl("arenas.narenas", &narenas, &sz, NULL, 0)) {
+        snprintf(tmp, sizeof(tmp), "arena.%d.purge", narenas);
+        if (!mallctl(tmp, NULL, 0, NULL, 0))
             return 0;
     }
     return -1;
diff --git a/src/zmalloc.h b/src/zmalloc.h
index 7816bbae0..75b205b96 100644
--- a/src/zmalloc.h
+++ b/src/zmalloc.h
@@ -55,7 +55,7 @@
 #include
 #if (JEMALLOC_VERSION_MAJOR == 2 && JEMALLOC_VERSION_MINOR >= 1) || (JEMALLOC_VERSION_MAJOR > 2)
 #define HAVE_MALLOC_SIZE 1
-#define zmalloc_size(p) je_malloc_usable_size(p)
+#define zmalloc_size(p) malloc_usable_size(p)
 #else
 #error "Newer version of jemalloc required"
 #endif
diff --git a/tests/instances.tcl b/tests/instances.tcl
index d4270d091..db267f54d 100644
--- a/tests/instances.tcl
+++ b/tests/instances.tcl
@@ -21,6 +21,7 @@ set ::tls 0
 set ::pause_on_error 0
 set ::dont_clean 0
 set ::simulate_error 0
+set ::flash 0
 set ::failed 0
 set ::sentinel_instances {}
 set ::redis_instances {}
@@ -81,6 +82,10 @@ proc spawn_instance {type base_port count {conf {}} {base_conf_file ""}} {
         set cfg [open $cfgfile w]
     }
 
+    if {$::flash} {
+        puts $cfg "storage-provider flash ./flash_$base_port"
+    }
+
     if {$::tls} {
         puts $cfg "tls-port $port"
         puts $cfg "tls-replication yes"
@@ -266,6 +271,8 @@ proc parse_options {} {
             set val2 [lindex $::argv [expr $j+2]]
             dict set ::global_config $val $val2
            incr j 2
+        } elseif {$opt eq {--flash}} {
+            set ::flash 1
         } elseif {$opt eq "--help"} {
            puts "--single <pattern>      Only runs tests specified by pattern."
            puts "--dont-clean            Keep log files on exit."
@@ -275,6 +282,7 @@ proc parse_options {} {
            puts "--tls                   Run tests in TLS mode."
            puts "--host <host>           Use hostname instead of 127.0.0.1."
            puts "--config <k> <v>        Extra config argument(s)."
+           puts "--flash                 Run the whole suite with flash enabled"
            puts "--help                  Shows this help."
            exit 0
         } else {
@@ -301,11 +309,15 @@ proc pause_on_error {} {
             break
         } elseif {$cmd eq {show-keydb-logs}} {
             set count 10
+            set instance {}
             if {[lindex $argv 1] ne {}} {set count [lindex $argv 1]}
+            if {[lindex $argv 2] ne {}} {set instance [lindex $argv 2]}
             foreach_redis_id id {
-                puts "=== KeyDB $id ===="
-                puts [exec tail -$count redis_$id/log.txt]
-                puts "---------------------\n"
+                if {$instance eq $id || $instance eq {}} {
+                    puts "=== KeyDB $id ===="
+                    puts [exec tail -$count redis_$id/log.txt]
+                    puts "---------------------\n"
+                }
             }
         } elseif {$cmd eq {show-sentinel-logs}} {
             set count 10
@@ -350,7 +362,7 @@ proc pause_on_error {} {
         } elseif {$cmd eq {help}} {
             puts "ls                       List Sentinel and KeyDB instances."
             puts "show-sentinel-logs \[N\]   Show latest N lines of logs."
-            puts "show-keydb-logs \[N\]      Show latest N lines of logs."
+            puts "show-keydb-logs \[N\] \[id\] Show latest N lines of logs of server id."
             puts "S <id> cmd ... arg       Call command in Sentinel <id>."
             puts "R <id> cmd ... arg       Call command in KeyDB <id>."
             puts "SI <id> <field>          Show Sentinel <id> INFO <field>."
diff --git a/tests/integration/psync2-reg-multimaster.tcl b/tests/integration/psync2-reg-multimaster.tcl
index 2e9b1187b..97e4e1587 100644
--- a/tests/integration/psync2-reg-multimaster.tcl
+++ b/tests/integration/psync2-reg-multimaster.tcl
@@ -53,12 +53,12 @@ start_server {overrides {active-replica {yes} multi-master {yes} client-output-b
         set elapsed [expr {[clock milliseconds]-$cycle_start_time}]
         if {$elapsed > $duration*1000} break
         if {rand() < .05} {
-            test "PSYNC2 #3899 regression: kill first replica" {
+            test "PSYNC2 #3899 regression (multi-master): kill first replica" {
                 $R(1) client kill type master
             }
         }
         if {rand() < .05} {
-            test "PSYNC2 #3899 regression: kill chained replica" {
+            test "PSYNC2 #3899 regression (multi-master): kill chained replica" {
                 $R(2) client kill type master
             }
         }
@@ -99,4 +99,8 @@ start_server {overrides {active-replica {yes} multi-master {yes} client-output-b
             fail [format "The three instances have different data sets:\n\tnode 1: %s\n\tnode 2: %s\n\tnode 3: %s\nRun diff -u against /tmp/repldump*.txt for more info" [$R(0) debug digest] [$R(1) debug digest] [$R(2) debug digest] ]
         }
     }
+
+    assert {[s -2 sync_partial_ok] > 0}
+    assert {[s -1 sync_partial_ok] > 0}
+    assert {[s 0 sync_partial_ok] > 0}
 }}}
diff --git a/tests/integration/replication-fast.tcl b/tests/integration/replication-fast.tcl
index a7368ff2c..e0da65edd 100644
--- a/tests/integration/replication-fast.tcl
+++ b/tests/integration/replication-fast.tcl
@@ -6,29 +6,31 @@ proc prepare_value {size} {
     return $_v
 }
 
-start_server {tags {"replication-fast"} overrides {storage-provider {flash ./rocks.db.master} databases 256}} {
-    set slave [srv 0 client]
-    set slave_host [srv 0 host]
-    set slave_port [srv 0 port]
-    start_server {tags {} overrides {storage-provider {flash ./rocks.db.replica} databases 256}} {
-        set master [srv 0 client]
-        set master_host [srv 0 host]
-        set master_port [srv 0 port]
+if {$::flash_enabled} {
+    start_server {tags {"replication-fast"} overrides {storage-provider {flash ./rocks.db.master} databases 256}} {
+        set slave [srv 0 client]
+        set slave_host [srv 0 host]
+        set slave_port [srv 0 port]
+        start_server {tags {} overrides {storage-provider {flash ./rocks.db.replica} databases 256}} {
+            set master [srv 0 client]
+            set master_host [srv 0 host]
+            set master_port [srv 0 port]
 
-        test "fast replication with large value" {
-            set _v [prepare_value [expr 64*1024*1024]]
-            # $master set key $_v
+            test "fast replication with large value" {
+                set _v [prepare_value [expr 64*1024*1024]]
+                # $master set key $_v
 
-            $slave replicaof $master_host $master_port
-            wait_for_condition 50 300 {
-                [lindex [$slave role] 0] eq {slave} &&
-                [string match {*master_link_status:up*} [$slave info replication]]
-            } else {
-                fail "Can't turn the instance into a replica"
-            }
+                $slave replicaof $master_host $master_port
+                wait_for_condition 50 300 {
+                    [lindex [$slave role] 0] eq {slave} &&
+                    [string match {*master_link_status:up*} [$slave info replication]]
+                } else {
+                    fail "Can't turn the instance into a replica"
+                }
 
-            assert_equal [$slave debug digest] [$master debug digest]
-            $slave replicaof no one
+                assert_equal [$slave debug digest] [$master debug digest]
+                $slave replicaof no one
+            }
         }
     }
 }
diff --git a/tests/integration/replication-psync-flash.tcl b/tests/integration/replication-psync-flash.tcl
index 8aa1ff010..84e877c6b 100644
--- a/tests/integration/replication-psync-flash.tcl
+++ b/tests/integration/replication-psync-flash.tcl
@@ -8,128 +8,130 @@
 # If reconnect is > 0, the test actually try to break the connection and
 # reconnect with the master, otherwise just the initial synchronization is
 # checked for consistency.
-proc test_psync {descr duration backlog_size backlog_ttl delay cond mdl sdl reconnect} {
-    start_server [list tags {"repl"} overrides [list storage-provider {flash .rocks.db.m} repl-backlog-size 64m]] {
-        start_server [list tags {flash} overrides [list storage-provider {flash ./rocks.db} delete-on-evict no storage-flush-period 10]] {
+if {$::flash_enabled} {
+    proc test_psync {descr duration backlog_size backlog_ttl delay cond mdl sdl reconnect} {
+        start_server [list tags {"repl"} overrides [list storage-provider {flash .rocks.db.m} repl-backlog-size 64m]] {
+            start_server [list tags {flash} overrides [list storage-provider {flash ./rocks.db} delete-on-evict no storage-flush-period 10]] {
 
-            set master [srv -1 client]
-            set master_host [srv -1 host]
-            set master_port [srv -1 port]
-            set slave [srv 0 client]
+                set master [srv -1 client]
+                set master_host [srv -1 host]
+                set master_port [srv -1 port]
+                set slave [srv 0 client]
 
-            set load_handle0 [start_bg_complex_data $master_host $master_port 9 100000]
-            set load_handle1 [start_bg_complex_data $master_host $master_port 11 100000]
-            set load_handle2 [start_bg_complex_data $master_host $master_port 12 100000]
+                set load_handle0 [start_bg_complex_data $master_host $master_port 9 100000]
+                set load_handle1 [start_bg_complex_data $master_host $master_port 11 100000]
+                set load_handle2 [start_bg_complex_data $master_host $master_port 12 100000]
 
-            test {Slave should be able to synchronize with the master} {
-                $slave slaveof $master_host $master_port
-                wait_for_condition 50 100 {
-                    [lindex [r role] 0] eq {slave} &&
-                    [lindex [r role] 3] eq {connected}
-                } else {
-                    fail "Replication not started."
+                test {Slave should be able to synchronize with the master} {
+                    $slave slaveof $master_host $master_port
+                    wait_for_condition 50 100 {
+                        [lindex [r role] 0] eq {slave} &&
+                        [lindex [r role] 3] eq {connected}
+                    } else {
+                        fail "Replication not started."
+                    }
                 }
-            }
 
-            # Check that the background clients are actually writing.
-            test {Detect write load to master} {
-                wait_for_condition 50 1000 {
-                    [$master dbsize] > 100
-                } else {
-                    fail "Can't detect write load from background clients."
+                # Check that the background clients are actually writing.
+                test {Detect write load to master} {
+                    wait_for_condition 50 1000 {
+                        [$master dbsize] > 100
+                    } else {
+                        fail "Can't detect write load from background clients."
+                    }
                 }
-            }
 
-            test "Test replication partial resync: $descr (diskless: $mdl, $sdl, reconnect: $reconnect)" {
-                # Now while the clients are writing data, break the maste-slave
-                # link multiple times.
-                if ($reconnect) {
-                    for {set j 0} {$j < $duration*10} {incr j} {
-                        after 100
-                        # catch {puts "MASTER [$master dbsize] keys, REPLICA [$slave dbsize] keys"}
+                test "Test replication partial resync: $descr (diskless: $mdl, $sdl, reconnect: $reconnect)" {
+                    # Now while the clients are writing data, break the master-slave
+                    # link multiple times.
+                    if ($reconnect) {
+                        for {set j 0} {$j < $duration*10} {incr j} {
+                            after 100
+                            # catch {puts "MASTER [$master dbsize] keys, REPLICA [$slave dbsize] keys"}
 
-                        if {($j % 20) == 0} {
-                            catch {
-                                $slave debug restart
+                            if {($j % 20) == 0} {
+                                catch {
+                                    $slave debug restart
+                                }
                             }
                         }
                     }
-                }
-                stop_bg_complex_data $load_handle0
-                stop_bg_complex_data $load_handle1
-                stop_bg_complex_data $load_handle2
+                    stop_bg_complex_data $load_handle0
+                    stop_bg_complex_data $load_handle1
+                    stop_bg_complex_data $load_handle2
 
-                # Wait for the slave to reach the "online"
-                # state from the POV of the master.
-                set retry 5000
-                while {$retry} {
-                    set info [$master info]
-                    if {[string match {*slave0:*state=online*} $info]} {
-                        break
-                    } else {
-                        incr retry -1
-                        after 100
+                    # Wait for the slave to reach the "online"
+                    # state from the POV of the master.
+                    set retry 5000
+                    while {$retry} {
+                        set info [$master info]
+                        if {[string match {*slave0:*state=online*} $info]} {
+                            break
+                        } else {
+                            incr retry -1
+                            after 100
+                        }
+                    }
+                    if {$retry == 0} {
+                        error "assertion:Slave not correctly synchronized"
                     }
-                }
-                if {$retry == 0} {
-                    error "assertion:Slave not correctly synchronized"
-                }
 
-                # Wait that slave acknowledge it is online so
-                # we are sure that DBSIZE and DEBUG DIGEST will not
-                # fail because of timing issues. (-LOADING error)
-                wait_for_condition 5000 100 {
-                    [lindex [$slave role] 3] eq {connected}
-                } else {
-                    fail "Slave still not connected after some time"
-                }
+                    # Wait until the slave acknowledges it is online so
+                    # we are sure that DBSIZE and DEBUG DIGEST will not
+                    # fail because of timing issues. (-LOADING error)
(-LOADING error) + wait_for_condition 5000 100 { + [lindex [$slave role] 3] eq {connected} + } else { + fail "Slave still not connected after some time" + } - set retry 20 - while {$retry && ([$master debug digest] ne [$slave debug digest])}\ - { - after 1000 - incr retry -1 - } - assert {[$master dbsize] > 0} + set retry 20 + while {$retry && ([$master debug digest] ne [$slave debug digest])}\ + { + after 1000 + incr retry -1 + } + assert {[$master dbsize] > 0} - if {[$master debug digest] ne [$slave debug digest]} { - set csv1 [csvdump r] - set csv2 [csvdump {r -1}] - set fd [open /tmp/repldump1.txt w] - puts -nonewline $fd $csv1 - close $fd - set fd [open /tmp/repldump2.txt w] - puts -nonewline $fd $csv2 - close $fd - puts "Master - Replica inconsistency" - puts "Run diff -u against /tmp/repldump*.txt for more info" + if {[$master debug digest] ne [$slave debug digest]} { + set csv1 [csvdump r] + set csv2 [csvdump {r -1}] + set fd [open /tmp/repldump1.txt w] + puts -nonewline $fd $csv1 + close $fd + set fd [open /tmp/repldump2.txt w] + puts -nonewline $fd $csv2 + close $fd + puts "Master - Replica inconsistency" + puts "Run diff -u against /tmp/repldump*.txt for more info" + } + assert_equal [r debug digest] [r -1 debug digest] + eval $cond } - assert_equal [r debug digest] [r -1 debug digest] - eval $cond } } } -} -foreach mdl {no yes} { - foreach sdl {disabled swapdb} { - test_psync {no reconnection, just sync} 6 1000000 3600 0 { - } $mdl $sdl 0 + foreach mdl {no yes} { + foreach sdl {disabled swapdb} { + test_psync {no reconnection, just sync} 6 1000000 3600 0 { + } $mdl $sdl 0 - test_psync {ok psync} 6 100000000 3600 0 { - assert {[s -1 sync_partial_ok] > 0} - } $mdl $sdl 1 + test_psync {ok psync} 6 100000000 3600 0 { + assert {[s -1 sync_partial_ok] > 0} + } $mdl $sdl 1 - test_psync {no backlog} 6 100 3600 0.5 { - assert {[s -1 sync_partial_err] > 0} - } $mdl $sdl 1 + test_psync {no backlog} 6 100 3600 0.5 { + assert {[s -1 sync_partial_err] > 0} + } $mdl $sdl 1 - test_psync {ok after delay} 3 100000000 3600 3 { - assert {[s -1 sync_partial_ok] > 0} - } $mdl $sdl 1 + test_psync {ok after delay} 3 100000000 3600 3 { + assert {[s -1 sync_partial_ok] > 0} + } $mdl $sdl 1 - test_psync {backlog expired} 3 100000000 1 3 { - assert {[s -1 sync_partial_err] > 0} - } $mdl $sdl 1 + test_psync {backlog expired} 3 100000000 1 3 { + assert {[s -1 sync_partial_err] > 0} + } $mdl $sdl 1 + } } } diff --git a/tests/modules/Makefile b/tests/modules/Makefile index 1beb217b8..b630508cb 100644 --- a/tests/modules/Makefile +++ b/tests/modules/Makefile @@ -39,7 +39,8 @@ TEST_MODULES = \ defragtest.so \ hash.so \ zset.so \ - stream.so + stream.so \ + load.so .PHONY: all diff --git a/tests/modules/hooks.c b/tests/modules/hooks.c index 3b69ac27a..185f3d5e0 100644 --- a/tests/modules/hooks.c +++ b/tests/modules/hooks.c @@ -212,6 +212,7 @@ void loadingCallback(RedisModuleCtx *ctx, RedisModuleEvent e, uint64_t sub, void case REDISMODULE_SUBEVENT_LOADING_RDB_START: keyname = "loading-rdb-start"; break; case REDISMODULE_SUBEVENT_LOADING_AOF_START: keyname = "loading-aof-start"; break; case REDISMODULE_SUBEVENT_LOADING_REPL_START: keyname = "loading-repl-start"; break; + case REDISMODULE_SUBEVENT_LOADING_FLASH_START: keyname = "loading-flash-start"; break; case REDISMODULE_SUBEVENT_LOADING_ENDED: keyname = "loading-end"; break; case REDISMODULE_SUBEVENT_LOADING_FAILED: keyname = "loading-failed"; break; } diff --git a/tests/modules/load.c b/tests/modules/load.c new file mode 100644 index 000000000..a70579b6f --- 
/dev/null +++ b/tests/modules/load.c @@ -0,0 +1,94 @@ +/* Server hooks API example + * + * ----------------------------------------------------------------------------- + * + * Copyright (c) 2019, Salvatore Sanfilippo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#define REDISMODULE_EXPERIMENTAL_API +#include "redismodule.h" + +size_t count, finalCount; + +/* Loading state change callback. */ +void loadCallback(RedisModuleCtx *ctx, RedisModuleEvent e, uint64_t sub, void *data) { + REDISMODULE_NOT_USED(ctx); + REDISMODULE_NOT_USED(e); + REDISMODULE_NOT_USED(data); + + if (sub == REDISMODULE_SUBEVENT_LOADING_FLASH_START || sub == REDISMODULE_SUBEVENT_LOADING_RDB_START || sub == REDISMODULE_SUBEVENT_LOADING_AOF_START || sub == REDISMODULE_SUBEVENT_LOADING_REPL_START) { + count = 0; + finalCount = 0; + } else if (sub == REDISMODULE_SUBEVENT_LOADING_ENDED) { + finalCount = count; + } +} + +int loadKeyCallback(RedisModuleCtx *ctx, int type, const char *event, RedisModuleString *key) { + REDISMODULE_NOT_USED(type); + REDISMODULE_NOT_USED(event); + + const char *keyname = RedisModule_StringPtrLen(key, NULL); + + RedisModule_Log(ctx, REDISMODULE_LOGLEVEL_NOTICE, "Loaded key: %s", keyname); + + count++; + return 0; +} + +int LoadCount_RedisCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) { + REDISMODULE_NOT_USED(argv); + RedisModule_AutoMemory(ctx); /* Use automatic memory management. */ + + if (argc != 1) return RedisModule_WrongArity(ctx); + + RedisModule_ReplyWithLongLong(ctx, finalCount); + + return REDISMODULE_OK; +} + +/* This function must be present on each Redis module. It is used in order to + * register the commands into the Redis server.
*/ +int RedisModule_OnLoad(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) { + REDISMODULE_NOT_USED(argv); + REDISMODULE_NOT_USED(argc); + + if (RedisModule_Init(ctx,"load",1,REDISMODULE_APIVER_1) + == REDISMODULE_ERR) return REDISMODULE_ERR; + + RedisModule_SubscribeToServerEvent(ctx, + RedisModuleEvent_Loading, loadCallback); + RedisModule_SubscribeToKeyspaceEvents(ctx, + REDISMODULE_NOTIFY_LOADED, loadKeyCallback); + + if (RedisModule_CreateCommand(ctx, "load.count", + LoadCount_RedisCommand,"readonly",1,1,1) == REDISMODULE_ERR) + return REDISMODULE_ERR; + return REDISMODULE_OK; +} diff --git a/tests/support/keydb.tcl b/tests/support/keydb.tcl index 978163e98..0cfd2a3e2 100644 --- a/tests/support/keydb.tcl +++ b/tests/support/keydb.tcl @@ -66,6 +66,33 @@ proc redis {{server 127.0.0.1} {port 6379} {defer 0} {tls 0} {tlsoptions {}} {re interp alias {} ::redis::redisHandle$id {} ::redis::__dispatch__ $id } +# On recent versions of tcl-tls/OpenSSL, reading from a dropped connection +# results in an error that we need to catch and mimic the old behavior. +proc ::redis::redis_safe_read {fd len} { + if {$len == -1} { + set err [catch {set val [read $fd]} msg] + } else { + set err [catch {set val [read $fd $len]} msg] + } + if {!$err} { + return $val + } + if {[string match "*connection abort*" $msg]} { + return {} + } + error $msg +} + +proc ::redis::redis_safe_gets {fd} { + if {[catch {set val [gets $fd]} msg]} { + if {[string match "*connection abort*" $msg]} { + return {} + } + error $msg + } + return $val +} + # This is a wrapper to the actual dispatching procedure that handles # reconnection if needed. proc ::redis::__dispatch__ {id method args} { @@ -146,6 +173,10 @@ proc ::redis::__method__read {id fd} { ::redis::redis_read_reply $id $fd } +proc ::redis::__method__rawread {id fd {len -1}} { + return [redis_safe_read $fd $len] +} + proc ::redis::__method__write {id fd buf} { ::redis::redis_write $fd $buf } @@ -192,8 +223,8 @@ proc ::redis::redis_writenl {fd buf} { } proc ::redis::redis_readnl {fd len} { - set buf [read $fd $len] - read $fd 2 ; # discard CR LF + set buf [redis_safe_read $fd $len] + redis_safe_read $fd 2 ; # discard CR LF return $buf } @@ -239,11 +270,11 @@ proc ::redis::redis_read_map {id fd} { } proc ::redis::redis_read_line fd { - string trim [gets $fd] + string trim [redis_safe_gets $fd] } proc ::redis::redis_read_null fd { - gets $fd + redis_safe_gets $fd return {} } @@ -260,7 +291,7 @@ proc ::redis::redis_read_reply {id fd} { } while {1} { - set type [read $fd 1] + set type [redis_safe_read $fd 1] switch -exact -- $type { _ {return [redis_read_null $fd]} : - diff --git a/tests/support/test.tcl b/tests/support/test.tcl index d5a7f9dc5..b814758e3 100644 --- a/tests/support/test.tcl +++ b/tests/support/test.tcl @@ -40,6 +40,12 @@ proc assert_failed {expected_err detail} { error "assertion:$expected_err $detail" } +proc assert_not_equal {value expected {detail ""}} { + if {!($expected ne $value)} { + assert_failed "Expected '$value' not equal to '$expected'" $detail + } +} + proc assert_equal {value expected {detail ""}} { if {$expected ne $value} { assert_failed "Expected '$value' to be equal to '$expected'" $detail diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl index 3b5bcf094..82f8e96b4 100644 --- a/tests/test_helper.tcl +++ b/tests/test_helper.tcl @@ -47,6 +47,7 @@ set ::all_tests { integration/replication-3 integration/replication-4 integration/replication-psync + integration/replication-psync-flash integration/replication-active
integration/replication-multimaster integration/replication-multimaster-connect @@ -59,11 +60,13 @@ set ::all_tests { integration/failover integration/keydb-cli integration/keydb-benchmark + integration/replication-fast integration/replication-psync-multimaster unit/pubsub unit/slowlog unit/scripting unit/maxmemory + unit/flash unit/introspection unit/introspection-2 unit/limits @@ -78,6 +81,7 @@ set ::all_tests { unit/pendingquerybuf unit/tls unit/tls-name-validation + unit/tls-auditlog unit/tracking unit/oom-score-adj unit/shutdown @@ -86,6 +90,7 @@ set ::all_tests { integration/logging integration/corrupt-dump integration/corrupt-dump-fuzzer + unit/soft_shutdown } # Index to the next test to run in the ::all_tests list. set ::next_test 0 @@ -735,6 +740,16 @@ for {set j 0} {$j < [llength $argv]} {incr j} { } } +# Check if we compiled with flash +set status [catch {exec src/keydb-server --is-flash-enabled}] +if {$status == 0} { + puts "KeyDB was built with FLASH, including FLASH tests" + set ::flash_enabled 1 +} else { + puts "KeyDB was not built with FLASH. Excluding FLASH tests" + set ::flash_enabled 0 +} + set filtered_tests {} # Set the filtered tests to be the short list (single_tests) if exists. diff --git a/tests/unit/flash.tcl b/tests/unit/flash.tcl index ea3c8e9fe..e66248fb2 100644 --- a/tests/unit/flash.tcl +++ b/tests/unit/flash.tcl @@ -1,170 +1,171 @@ -start_server [list tags {flash} overrides [list storage-provider {flash ./rocks.db} delete-on-evict no storage-flush-period 10]] { +if {$::flash_enabled} { + start_server [list tags {flash} overrides [list storage-provider {flash ./rocks.db} delete-on-evict no storage-flush-period 10]] { - test { FLASH - GET works after eviction } { - r set testkey foo - r flushall cache - assert_equal {foo} [r get testkey] - } + test { FLASH - GET works after eviction } { + r set testkey foo + r flushall cache + assert_equal {foo} [r get testkey] + } - test { DEL of nonexistant key returns 0 } { - r flushall - assert_equal {0} [r del foobar] - assert_equal {0} [r dbsize] "Key count is accurate after non-existant delete" - } + test { DEL of nonexistent key returns 0 } { + r flushall + assert_equal {0} [r del foobar] + assert_equal {0} [r dbsize] "Key count is accurate after non-existent delete" + } - test { DEL of flushed key works } { - r flushall - r set testkey foo - assert_equal {1} [r dbsize] "Only one key after first insert" - r flushall cache - assert_equal {foo} [r get testkey] "Value still there after flushing cache" - r del testkey - assert_equal {0} [r dbsize] "No keys after delete" - } + test { DEL of flushed key works } { + r flushall + r set testkey foo + assert_equal {1} [r dbsize] "Only one key after first insert" + r flushall cache + assert_equal {foo} [r get testkey] "Value still there after flushing cache" + r del testkey + assert_equal {0} [r dbsize] "No keys after delete" + } - test { SET of existing but flushed key works } { + test { SET of existing but flushed key works } { + r flushall + r set testkey foo + assert_equal {1} [r dbsize] "Only one key after first insert" + r flushall cache + assert_equal {1} [r dbsize] "Only one key after flushall cache" + r set testkey bar + assert_equal {1} [r dbsize] "Only one key after overwrite" + assert_equal {bar} [r get testkey] + } + + test { SET of existing but flushed key with EXPIRE works } { r flushall - r set testkey foo + assert_equal {0} [r dbsize] + r set testkey foo ex 10000 assert_equal {1} [r dbsize] "Only one key after first insert" r flushall cache assert_equal {1} [r
dbsize] "Only one key after flushall cache" - r set testkey bar + r set testkey bar ex 10000 assert_equal {1} [r dbsize] "Only one key after overwrite" assert_equal {bar} [r get testkey] - } - - test { SET of existing but flushed key with EXPIRE works } { - r flushall - assert_equal {0} [r dbsize] - r set testkey foo ex 10000 - assert_equal {1} [r dbsize] "Only one key after first insert" - r flushall cache - assert_equal {1} [r dbsize] "Only one key after flushall cache" - r set testkey bar ex 10000 - assert_equal {1} [r dbsize] "Only one key after overwrite" - assert_equal {bar} [r get testkey] - assert [expr [r ttl testkey] > 0] - } - - test { EXPIRE of existing but flushed key } { - r flushall - assert_equal {0} [r dbsize] - r set testkey foo - assert_equal {1} [r dbsize] - r flushall cache - r expire testkey 10000 - assert_equal {1} [r dbsize] - assert_equal {foo} [r get testkey] - assert [expr [r ttl testkey] > 0] - } + assert [expr [r ttl testkey] > 0] + } - test { CREATE and UPDATE in transaction, key count is accurate } { + test { EXPIRE of existing but flushed key } { r flushall - r multi - r set testkey 2 - r incr testkey - r exec + assert_equal {0} [r dbsize] + r set testkey foo assert_equal {1} [r dbsize] - assert_equal {3} [r get testkey] - } - - test { EXPIRE key count is accurate } { - r flushall - r set testkey foo ex 1 r flushall cache + r expire testkey 10000 assert_equal {1} [r dbsize] - after 1500 - assert_equal {0} [r dbsize] - } + assert_equal {foo} [r get testkey] + assert [expr [r ttl testkey] > 0] + } - test { SUBKEY EXPIRE persists after cache flush } { - r flushall - r sadd testkey foo bar baz - r expiremember testkey foo 10000 - r flushall cache - assert [expr [r ttl testkey foo] > 0] - } + test { CREATE and UPDATE in transaction, key count is accurate } { + r flushall + r multi + r set testkey 2 + r incr testkey + r exec + assert_equal {1} [r dbsize] + assert_equal {3} [r get testkey] + } - test { LIST pop works after flushing cache } { - r flushall - r lpush testkey foo - r flushall cache - assert_equal {foo} [r lpop testkey] - } + test { EXPIRE key count is accurate } { + r flushall + r set testkey foo ex 1 + r flushall cache + assert_equal {1} [r dbsize] + after 1500 + assert_equal {0} [r dbsize] + } - test { DIGEST string the same after flushing cache } { - r flushall - r set testkey foo - r set testkey1 foo ex 10000 - set expectedDigest [r debug digest] - r flushall cache - assert_equal $expectedDigest [r debug digest] - } + test { SUBKEY EXPIRE persists after cache flush } { + r flushall + r sadd testkey foo bar baz + r expiremember testkey foo 10000 + r flushall cache + assert [expr [r ttl testkey foo] > 0] + } - test { DIGEST list the same after flushing cache } { - r flushall - r lpush testkey foo bar - set expectedDigest [r debug digest] - r flushall cache - assert_equal $expectedDigest [r debug digest] - } + test { LIST pop works after flushing cache } { + r flushall + r lpush testkey foo + r flushall cache + assert_equal {foo} [r lpop testkey] + } - test { DELETE of flushed set member persists after another flush } { - r flushall - r sadd set1 val1 val2 val3 - assert_equal {3} [r scard set1] - r flushall cache - r srem set1 val1 - r flushall cache - assert_equal {2} [r scard set1] - } + test { DIGEST string the same after flushing cache } { + r flushall + r set testkey foo + r set testkey1 foo ex 10000 + set expectedDigest [r debug digest] + r flushall cache + assert_equal $expectedDigest [r debug digest] + } + + test { DIGEST list the same after 
flushing cache } { + r flushall + r lpush testkey foo bar + set expectedDigest [r debug digest] + r flushall cache + assert_equal $expectedDigest [r debug digest] + } + + test { DELETE of flushed set member persists after another flush } { + r flushall + r sadd set1 val1 val2 val3 + assert_equal {3} [r scard set1] + r flushall cache + r srem set1 val1 + r flushall cache + assert_equal {2} [r scard set1] + } - r flushall - # If a weak storage memory model is set, wait for any pending snapshot writes to finish - after 500 - foreach policy { - allkeys-random allkeys-lru allkeys-lfu - } { - test "FLASH - is eviction working without data loss (successfully stored to flash)? (policy $policy)" { - # Get the current memory limit and calculate a new limit. - # Set limit to 100M. - set used [s used_memory] - set limit [expr {$used+60*1024*1024}] - r config set maxmemory $limit - r config set maxmemory-policy $policy - # Now add keys equivalent to 1024b until the limit is almost reached. - set numkeys 0 - r set first val - while 1 { - r set $numkeys xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx - incr numkeys - if {[s used_memory]+1024 >= $limit} { - break + r flushall + # If a weak storage memory model is set, wait for any pending snapshot writes to finish + after 500 + foreach policy { + allkeys-random allkeys-lru allkeys-lfu + } { + test "FLASH - is eviction working without data loss (successfully stored to flash)? (policy $policy)" { + # Get the current memory limit and calculate a new limit. + # Set limit to 100M. + set used [s used_memory] + set limit [expr {$used+60*1024*1024}] + r config set maxmemory $limit + r config set maxmemory-policy $policy + # Now add keys equivalent to 1024b until the limit is almost reached. 
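# The loop below is the usual maxmemory fill probe: write fixed-size values
# until used_memory approaches the configured limit, counting how many keys
# fit. The same idiom as a reusable helper would look roughly like this
# (hypothetical proc, not part of this suite):
#   proc fill_until_maxmemory {limit {payload 1024}} {
#       set n 0
#       while {[s used_memory] + $payload < $limit} {
#           r set key:$n [string repeat x $payload]
#           incr n
#       }
#       return $n
#   }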
+ set numkeys 0 + r set first val + while 1 { + r set $numkeys xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx + incr numkeys + if {[s used_memory]+1024 >= $limit} { + break + } } + # Add additional keys to force eviction + # should still be under the limit for maxmemory, however all keys set should still exist between flash and memory + # check same number of keys exist in addition to values of first and last keys + set err 0 + set extra_keys [expr floor([expr ($limit * 0.4) / 1024])] + for {set j 0} {$j < $extra_keys} {incr j} { + catch { + r set p2$j xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx + } err + assert {$err == {OK}} + } + if {[log_file_matches [srv 0 stdout] "*Failed to evict*"]} { + fail "Server did not evict cleanly (detected full flush)" + } + r set last val + set dbsize [r dbsize] + assert {[s used_memory] < ($limit*1.2)} + assert {$dbsize == $numkeys+$extra_keys+2} + assert {[r get first] == {val}} + assert {[r get last] == {val}} + r flushall } - # Add additional keys to force eviction - # should still be under the limit for maxmemory, however all keys set should still exist between flash and memory - # check same number of keys exist in addition to values of first and last keys - set err 0 - set extra_keys [expr floor([expr ($limit * 0.4) / 1024])] - for {set j 0} {$j < $extra_keys} {incr j} { - catch { - r set p2$j 
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx - } err - assert {$err == {OK}} - } - if {[log_file_matches [srv 0 stdout] "*Failed to evict*"]} { - fail "Server did not evict cleanly (detected full flush)" - } - r set last val - set dbsize [r dbsize] - assert {[s used_memory] < ($limit*1.2)} - assert {$dbsize == $numkeys+$extra_keys+2} - assert {[r get first] == {val}} - assert {[r get last] == {val}} - r flushall } - } + } } - diff --git a/tests/unit/introspection.tcl b/tests/unit/introspection.tcl index 66414c85f..7678a1dd5 100644 --- a/tests/unit/introspection.tcl +++ b/tests/unit/introspection.tcl @@ -162,15 +162,16 @@ start_server {tags {"introspection"}} { aof_rewrite_cpulist time-thread-priority bgsave_cpulist - storage-cache-mode - storage-provider-options - use-fork + storage-cache-mode + storage-provider-options + use-fork multi-master active-replica bind set-proc-title repl-backlog-disk-reserve tls-allowlist + db-s3-object } if {!$::tls} { diff --git a/tests/unit/memefficiency.tcl b/tests/unit/memefficiency.tcl index db18e7128..a30129e4c 100644 --- a/tests/unit/memefficiency.tcl +++ b/tests/unit/memefficiency.tcl @@ -49,10 +49,13 @@ start_server {tags {"defrag"} overrides {appendonly yes auto-aof-rewrite-percent r config set maxmemory 100mb r config set maxmemory-policy allkeys-lru - populate 700000 asdf1 150 - populate 170000 asdf2 300 - after 120 ;# serverCron only updates the info once in 100ms - set frag [s allocator_frag_ratio] + for {set i 0} {$i < 10} {incr i} { + populate 700000 asdf1 150 + populate 170000 asdf2 300 + after 120 ;# serverCron only updates the info once in 100ms + set frag [s allocator_frag_ratio] + if {$frag >= 1.4} {break} + } if {$::verbose} { puts "frag $frag" } @@ -507,11 +510,14 @@ start_server {tags {"defrag"} overrides {server-threads 1 active-replica yes} } r config set active-defrag-ignore-bytes 2mb r config set maxmemory 100mb r config set maxmemory-policy allkeys-lru - populate 700000 asdf1 150 - populate 170000 asdf2 300 - r ping ;# trigger eviction following the previous population - after 120 ;# serverCron only updates the info once in 100ms - set frag [s allocator_frag_ratio] + for {set i 0} {$i < 10} {incr i} { + populate 700000 asdf1 150 + populate 170000 asdf2 300 + r ping ;# trigger eviction following the previous population + after 120 ;# serverCron only updates the info once in 100ms + set frag [s allocator_frag_ratio] + if {$frag >= 1.4} {break} + } if {$::verbose} { puts "frag $frag" } diff --git a/tests/unit/moduleapi/load.tcl b/tests/unit/moduleapi/load.tcl new file mode 
100644 index 000000000..853b9aebb --- /dev/null +++ b/tests/unit/moduleapi/load.tcl @@ -0,0 +1,15 @@ +set testmodule [file normalize tests/modules/load.so] + +if {$::flash_enabled} { + start_server [list tags [list "modules"] overrides [list storage-provider {flash ./rocks.db.master} databases 256 loadmodule $testmodule]] { + test "Module is notified of keys loaded from flash" { + r flushall + r set foo bar + r set bar foo + r set foobar barfoo + assert_equal [r load.count] 0 + r debug reload + assert_equal [r load.count] 3 + } + } +} \ No newline at end of file diff --git a/tests/unit/obuf-limits.tcl b/tests/unit/obuf-limits.tcl index bbb9fcbf6..0cefb9129 100644 --- a/tests/unit/obuf-limits.tcl +++ b/tests/unit/obuf-limits.tcl @@ -111,7 +111,7 @@ start_server {tags {"obuf-limits"} overrides { server-threads 1 }} { # Read nothing set fd [$rd channel] - assert_equal {} [read $fd] + assert_equal {} [$rd rawread] } # Note: This test assumes that what's written with one write, will be read by redis in one read. @@ -151,8 +151,7 @@ start_server {tags {"obuf-limits"} overrides { server-threads 1 }} { assert_equal "PONG" [r ping] set clients [r client list] assert_no_match "*name=multicommands*" $clients - set fd [$rd2 channel] - assert_equal {} [read $fd] + assert_equal {} [$rd2 rawread] } test {Execute transactions completely even if client output buffer limit is enforced} { @@ -183,4 +182,12 @@ start_server {tags {"obuf-limits"} overrides { server-threads 1 }} { assert_equal "v2" [r get k2] assert_equal "v3" [r get k3] } + + test "Obuf limit, HRANDFIELD with huge count stopped mid-run" { + r config set client-output-buffer-limit {normal 1000000 0 0} + r hset myhash a b + catch {r hrandfield myhash -999999999} e + assert_match "*I/O error*" $e + reconnect + } } diff --git a/tests/unit/scripting.tcl b/tests/unit/scripting.tcl index e4af19d5c..561bcf7e1 100644 --- a/tests/unit/scripting.tcl +++ b/tests/unit/scripting.tcl @@ -43,6 +43,15 @@ start_server {tags {"scripting"}} { r eval {return redis.call('get',KEYS[1])} 1 mykey } {myval} + test {EVAL - keys command works? 
} { + r eval {return redis.call('keys', 'test')} 0 + } + + test {EVAL - KeyDB global works } { + r eval {return keydb.call('get', KEYS[1])} 1 mykey + assert_equal [r eval {return redis.call('get',KEYS[1])} 1 mykey] [r eval {return keydb.call('get', KEYS[1])} 1 mykey] + } + test {EVALSHA - Can we call a SHA1 if already defined?} { r evalsha fd758d1589d044dd850a6f05d52f2eefd27f033f 1 mykey } {myval} @@ -126,6 +135,10 @@ start_server {tags {"scripting"}} { r select 9 set res } {original value} + + test {EVAL background command} { + r eval {redis.call("SCAN", "0", "MATCH", "key*", "COUNT", 1000)} 0 + } {} if 0 { test {EVAL - Script can't run more than configured time limit} { diff --git a/tests/unit/soft_shutdown.tcl b/tests/unit/soft_shutdown.tcl new file mode 100644 index 000000000..48328d72f --- /dev/null +++ b/tests/unit/soft_shutdown.tcl @@ -0,0 +1,141 @@ +start_server {tags {"soft_shutdown"} } { + test {soft shutdown command replies} { + assert_equal [r shutdown soft] "OK" + } + + test {soft shutdown errors on ping} { + catch {[r ping]} e + assert_match {SHUTDOWN PENDING} $e + } +} + +start_server {tags {"soft_shutdown"} } { + test {soft shutdown prevents new connections} { + assert_equal [r shutdown soft] "OK" + # reconnect + set catch_res [catch {set rd [redis_deferring_client]} e] + if {$::tls} { + assert_equal $catch_res 1 + } else { + assert_match {*SHUTDOWN*} $e + } + } +} + +start_server {tags {"soft_shutdown"} } { + test {soft shutdown allows commands to execute while waiting} { + assert_equal [r shutdown soft] "OK" + r set test val + assert_equal [r get test] {val} + } +} + +start_server {tags {"soft_shutdown"} } { + test {soft shutdown shuts down after all clients exit} { + assert_equal [r shutdown soft] "OK" + r close + after 500 + catch {set rd [redis_deferring_client]} e + assert_match {*refused*} $e + } +} + +start_server {tags {"soft_shutdown"} overrides {soft-shutdown yes} } { + test {soft shutdown triggered by SIGINT} { + exec kill -SIGINT [s process_id] + after 100 + catch {[r ping]} e + assert_match {SHUTDOWN PENDING} $e + } + + test {second SIGINT forces a shutdown during a soft shutdown} { + exec kill -SIGINT [s process_id] + after 100 + catch {[r ping]} e + assert_match {*I/O*} $e + } +} + +start_server {tags {"soft_shutdown"} } { + test {monitor does not prevent soft shutdown} { + set monitor [redis_deferring_client] + $monitor monitor + assert_equal [r shutdown soft] "OK" + r close + after 500 + catch {set rd [redis_deferring_client]} e + assert_match {*refused*} $e + } +} + +start_server {tags {"soft_shutdown"} } { + start_server {} { + set node_0 [srv 0 client] + set node_0_host [srv 0 host] + set node_0_port [srv 0 port] + set node_0_pid [srv 0 pid] + + set node_1 [srv -1 client] + set node_1_host [srv -1 host] + set node_1_port [srv -1 port] + set node_1_pid [srv -1 pid] + + $node_0 replicaof $node_1_host $node_1_port + wait_for_sync $node_0 + + test {soft shutdown works with master} { + $node_1 shutdown soft + } {OK} + + test {soft shutdown on master doesn't affect replica} { + assert_equal [$node_0 ping] {PONG} + } + + test {soft shutdown on master updates ping response} { + catch {$node_1 ping} e + assert_equal $e {SHUTDOWN PENDING} + } + + test {master prevents new connections with soft shutdown} { + set c1 [redis $node_1_host $node_1_port 1 $::tls] + set catch_res [catch {$c1 read} e] + if {$::tls} { + assert_equal $catch_res 1 + } else { + assert_match {*SHUTDOWN*} $e + } + } + + test {master soft shutdown works after all clients disconnect} {
$node_1 close + after 500 + catch {set c1 [redis $node_1_host $node_1_port 1 $::tls]} e + assert_match {*refused*} $e + } + } +} + +start_server {tags {"soft_shutdown"} } { + start_server {} { + set node_0 [srv 0 client] + set node_0_host [srv 0 host] + set node_0_port [srv 0 port] + set node_0_pid [srv 0 pid] + + set node_1 [srv -1 client] + set node_1_host [srv -1 host] + set node_1_port [srv -1 port] + set node_1_pid [srv -1 pid] + + $node_0 replicaof $node_1_host $node_1_port + wait_for_sync $node_0 + + test {soft shutdown on replica is not blocked by master} { + assert_equal [$node_0 shutdown soft] {OK} + $node_0 close + after 500 + catch {set c0 [redis $node_0_host $node_0_port 1 $::tls]} e + assert_match {*refused*} $e + } + } +} \ No newline at end of file diff --git a/tests/unit/sort.tcl b/tests/unit/sort.tcl index 083c4540d..2f4c4db38 100644 --- a/tests/unit/sort.tcl +++ b/tests/unit/sort.tcl @@ -315,4 +315,15 @@ start_server { } } } + + test {SORT with huge LIMIT offset} { + r lpush L 2 1 0 + # expecting a different outcome on 32 and 64 bit systems + foreach value {9223372036854775807 2147483647} { + catch {r sort_ro L by a limit 2 $value} res + if {![string match "2" $res] && ![string match "*out of range*" $res]} { + assert_equal $res "expecting an error or 2" + } + } + } } diff --git a/tests/unit/tls-auditlog.tcl b/tests/unit/tls-auditlog.tcl new file mode 100644 index 000000000..4761ada56 --- /dev/null +++ b/tests/unit/tls-auditlog.tcl @@ -0,0 +1,159 @@ +# only run this test if tls is enabled +if {$::tls} { + package require tls + + test {TLS Audit Log: Able to connect with no exclusion list} { + start_server {tags {"tls"}} { + catch {r PING} e + assert_match {PONG} $e + } + } + + test {TLS Audit Log: Able to connect with exclusion list '*'} { + start_server {tags {"tls"} overrides {tls-auditlog-blocklist *}} { + catch {r PING} e + assert_match {PONG} $e + } + } + + test {TLS Audit Log: Able to connect with matching CN} { + start_server {tags {"tls"} overrides {tls-auditlog-blocklist client.keydb.dev}} { + catch {r PING} e + assert_match {PONG} $e + } + } + + test {TLS Audit Log: Able to connect with matching SAN} { + start_server {tags {"tls"} overrides {tls-auditlog-blocklist san1.keydb.dev}} { + catch {r PING} e + assert_match {PONG} $e + } + } + + test {TLS Audit Log: Able to connect with matching CN with wildcard} { + start_server {tags {"tls"} overrides {tls-auditlog-blocklist client*.dev}} { + catch {r PING} e + assert_match {PONG} $e + } + } + + test {TLS Audit Log: Able to connect with matching SAN with wildcard} { + start_server {tags {"tls"} overrides {tls-auditlog-blocklist san*.dev}} { + catch {r PING} e + assert_match {PONG} $e + } + } + + test {TLS Audit Log: Able to connect with CN in a comprehensive list} { + start_server {tags {"tls"} overrides {tls-auditlog-blocklist {dummy.keydb.dev client.keydb.dev other.keydb.dev}}} { + catch {r PING} e + assert_match {PONG} $e + } + } + + test {TLS Audit Log: Able to connect with SAN in a comprehensive list} { + start_server {tags {"tls"} overrides {tls-auditlog-blocklist {dummy.keydb.dev san2.keydb.dev other.keydb.dev}}} { + catch {r PING} e + assert_match {PONG} $e + } + } + + test {TLS Audit Log: Able to connect with CN in a comprehensive list with wildcards} { + start_server {tags {"tls"} overrides {tls-auditlog-blocklist {dummy.* client*.dev other.*}}} { + catch {r PING} e + assert_match {PONG} $e + } + } + + test {TLS Audit Log: Able to connect with SAN in a comprehensive
list with wildcards} { + start_server {tags {"tls"} overrides {tls-auditlog-blocklist {dummy.* san*.dev other.*}}} { + catch {r PING} e + assert_match {PONG} $e + } + } + + test {TLS Audit Log: blocklisted client can still connect} { + start_server {tags {"tls"} overrides {tls-auditlog-blocklist {client.keydb.dev}}} { + catch {r PING} e + assert_match {PONG} $e + } + } + + test {TLS Audit Log: Able to match against DNS SAN} { + start_server {tags {"tls"} overrides {tls-auditlog-blocklist {san1.keydb.dev}}} { + catch {r PING} e + assert_match {PONG} $e + } + } + + test {TLS Audit Log: Able to match against email SAN} { + start_server {tags {"tls"} overrides {tls-auditlog-blocklist {someone@keydb.dev}}} { + catch {r PING} e + assert_match {PONG} $e + } + } + + test {TLS Audit Log: Able to match against IPv4 SAN} { + start_server {tags {"tls"} overrides {tls-auditlog-blocklist {192.168.0.1}}} { + catch {r PING} e + assert_match {PONG} $e + } + } + + test {TLS Audit Log: Able to match against IPv4 with a wildcard} { + start_server {tags {"tls"} overrides {tls-auditlog-blocklist {192.*}}} { + catch {r PING} e + assert_match {PONG} $e + } + } + + test {TLS Audit Log: Able to match against URI SAN} { + start_server {tags {"tls"} overrides {tls-auditlog-blocklist {https://keydb.dev}}} { + catch {r PING} e + assert_match {PONG} $e + } + } + + test {TLS Audit Log: commands logged for clients not in blocklist} { + start_server {tags {"tls"} overrides {tls-auditlog-blocklist test.dev}} { + r set testkey foo + wait_for_condition 50 1000 { + [log_file_matches [srv 0 stdout] "*Audit Log: *, cmd set, keys: testkey*"] + } else { + fail "Missing expected Audit Log entry" + } + catch {r PING} e + assert_match {PONG} $e + } + } + + test {TLS Audit Log: Able to connect with matching TLS allowlist and Audit Log blocklist} { + start_server {tags {"tls"} overrides {tls-allowlist client.keydb.dev tls-auditlog-blocklist client.keydb.dev}} { + r set testkey foo + if {[log_file_matches [srv 0 stdout] "*Audit Log: *, cmd set, keys: testkey*"]} { + fail "Unexpected Audit Log entry" + } + catch {r PING} e + assert_match {PONG} $e + } + } + + test {TLS Audit Log: Able to connect with different TLS allowlist and Audit Log blocklist} { + start_server {tags {"tls"} overrides {tls-allowlist client.keydb.dev tls-auditlog-blocklist test.dev}} { + r set testkey foo + wait_for_condition 50 1000 { + [log_file_matches [srv 0 stdout] "*Audit Log: *, cmd set, keys: testkey*"] + } else { + fail "Missing expected Audit Log entry" + } + catch {r PING} e + assert_match {PONG} $e + } + } + +} else { + start_server {} { + # just a dummy server so that the test doesn't panic if tls is disabled + # otherwise the test will try to connect to a server that just isn't there + } +} diff --git a/tests/unit/type/hash.tcl b/tests/unit/type/hash.tcl index e95fd9fce..916ebd534 100644 --- a/tests/unit/type/hash.tcl +++ b/tests/unit/type/hash.tcl @@ -68,6 +68,13 @@ start_server {tags {"hash"}} { r hrandfield myhash 0 } {} + test "HRANDFIELD count overflow" { + r hmset myhash a 1 + assert_error {*value is out of range*} {r hrandfield myhash -9223372036854770000 withvalues} + assert_error {*value is out of range*} {r hrandfield myhash -9223372036854775808 withvalues} + assert_error {*value is out of range*} {r hrandfield myhash -9223372036854775808} + } {} + test "HRANDFIELD with <count> against non existing key" { r hrandfield nonexisting_key 100 } {} @@ -826,4 +833,8 @@ set _ $k } {ZIP_INT_8B 127 ZIP_INT_16B 32767 ZIP_INT_32B 2147483647 ZIP_INT_64B
9223372036854775808 ZIP_INT_IMM_MIN 0 ZIP_INT_IMM_MAX 12} + test {HINCRBYFLOAT does not allow NaN or Infinity} { + assert_error "*value is NaN or Infinity*" {r hincrbyfloat hfoo field +inf} + assert_equal 0 [r exists hfoo] + } } diff --git a/tests/unit/type/list.tcl b/tests/unit/type/list.tcl index 36020c125..69c519a8e 100644 --- a/tests/unit/type/list.tcl +++ b/tests/unit/type/list.tcl @@ -1069,6 +1069,13 @@ start_server { r ping } {PONG} + test "Regression for bug 659 - chaining BRPOP with async blocking cmds" { + r flushall + r brpop foo 1 + r keys * + r ping + } {PONG} + test "client unblock tests" { r del l set rd [redis_deferring_client] diff --git a/tests/unit/type/set.tcl b/tests/unit/type/set.tcl index ee7b936b5..a24b4c601 100644 --- a/tests/unit/type/set.tcl +++ b/tests/unit/type/set.tcl @@ -588,6 +588,11 @@ start_server { r srandmember nonexisting_key 100 } {} + test "SRANDMEMBER count overflow" { + r sadd myset a + assert_error {*value is out of range*} {r srandmember myset -9223372036854775808} + } {} + # Make sure we can distinguish between an empty array and a null response r readraw 1 diff --git a/tests/unit/type/stream.tcl b/tests/unit/type/stream.tcl index c299a1e97..2990f47ef 100644 --- a/tests/unit/type/stream.tcl +++ b/tests/unit/type/stream.tcl @@ -172,6 +172,15 @@ start_server { assert_equal [r XRANGE mystream - +] {{3-0 {f v}} {4-0 {f v}} {5-0 {f v}}} } + test {XTRIM with MINID option, big delta from master record} { + r DEL mystream + r XADD mystream 1-0 f v + r XADD mystream 1641544570597-0 f v + r XADD mystream 1641544570597-1 f v + r XTRIM mystream MINID 1641544570597-0 + assert_equal [r XRANGE mystream - +] {{1641544570597-0 {f v}} {1641544570597-1 {f v}}} + } + test {XADD mass insertion and XLEN} { r DEL mystream r multi diff --git a/tests/unit/type/string.tcl b/tests/unit/type/string.tcl index 43968b26b..0c957906d 100644 --- a/tests/unit/type/string.tcl +++ b/tests/unit/type/string.tcl @@ -574,4 +574,14 @@ start_server {tags {"string"}} { test {LCS indexes with match len and minimum match len} { dict get [r STRALGO LCS IDX KEYS virus1 virus2 WITHMATCHLEN MINMATCHLEN 5] matches } {{{1 222} {13 234} 222}} + + test {SETRANGE with huge offset} { + foreach value {9223372036854775807 2147483647} { + catch {[r setrange K $value A]} res + # expecting a different error on 32 and 64 bit systems + if {![string match "*string exceeds maximum allowed size*" $res] && ![string match "*out of range*" $res]} { + assert_equal $res "expecting an error" + } + } + } } diff --git a/tests/unit/type/zset.tcl b/tests/unit/type/zset.tcl index 94b2ab480..02184fc8c 100644 --- a/tests/unit/type/zset.tcl +++ b/tests/unit/type/zset.tcl @@ -1714,6 +1714,13 @@ start_server {tags {"zset"}} { r zrandmember nonexisting_key 100 } {} + test "ZRANDMEMBER count overflow" { + r zadd myzset 0 a + assert_error {*value is out of range*} {r zrandmember myzset -9223372036854770000 withscores} + assert_error {*value is out of range*} {r zrandmember myzset -9223372036854775808 withscores} + assert_error {*value is out of range*} {r zrandmember myzset -9223372036854775808} + } {} + # Make sure we can distinguish between an empty array and a null response r readraw 1 diff --git a/utils/create-cluster/create-cluster b/utils/create-cluster/create-cluster index 4733f0ccb..c8df34044 100755 --- a/utils/create-cluster/create-cluster +++ b/utils/create-cluster/create-cluster @@ -26,7 +26,11 @@ then while [ $((PORT < ENDPORT)) != "0" ]; do PORT=$((PORT+1)) echo "Starting $PORT" - $BIN_PATH/keydb-server --port $PORT 
--protected-mode $PROTECTED_MODE --cluster-enabled yes --cluster-config-file nodes-${PORT}.conf --cluster-node-timeout $TIMEOUT --appendonly yes --appendfilename appendonly-${PORT}.aof --dbfilename dump-${PORT}.rdb --logfile ${PORT}.log --daemonize yes ${ADDITIONAL_OPTIONS} + if [ "$2" == "flash" ] + then + ADDITIONAL_OPTIONS="--save \"\" \"\" \"\" --semi-ordered-set-bucket-size 8 --client-output-buffer-limit replica 1 1 0 --maxmemory 100000000 --storage-provider flash ./$PORT.flash" + fi + $BIN_PATH/keydb-server --server-threads 4 --port $PORT --protected-mode $PROTECTED_MODE --cluster-enabled yes --cluster-config-file nodes-${PORT}.conf --cluster-node-timeout $TIMEOUT --dbfilename dump-${PORT}.rdb --logfile ${PORT}.log --daemonize yes ${ADDITIONAL_OPTIONS} done exit 0 fi @@ -97,6 +101,8 @@ then rm -rf appendonly*.aof rm -rf dump*.rdb rm -rf nodes*.conf + rm -rf *.flash + rm -rf temp*.rdb exit 0 fi diff --git a/utils/redis-copy.rb b/utils/redis-copy.rb index aa9e797b5..c32c0c022 100644 --- a/utils/redis-copy.rb +++ b/utils/redis-copy.rb @@ -1,27 +1,27 @@ -# keydb-copy.rb - Copyright (C) 2009-2010 Salvatore Sanfilippo +# redis-copy.rb - Copyright (C) 2009-2010 Salvatore Sanfilippo # BSD license, See the COPYING file for more information. # -# Copy the whole dataset from one Redis instance to another one +# Copy the whole dataset from one KeyDB/Redis instance to another one # # WARNING: this utility is deprecated and serves as a legacy adapter -# for the more-robust keydb-copy gem. +# for the more-robust redis-copy gem. require 'shellwords' def redisCopy(opts={}) src = "#{opts[:srchost]}:#{opts[:srcport]}" dst = "#{opts[:dsthost]}:#{opts[:dstport]}" - `keydb-copy #{src.shellescape} #{dst.shellescape}` + `redis-copy #{src.shellescape} #{dst.shellescape}` rescue Errno::ENOENT - $stderr.puts 'This utility requires the keydb-copy executable', - 'from the keydb-copy gem on https://rubygems.org', - 'To install it, run `gem install keydb-copy`.' + $stderr.puts 'This utility requires the redis-copy executable', + 'from the redis-copy gem on https://rubygems.org', + 'To install it, run `gem install redis-copy`.' exit 1 end -$stderr.puts "This utility is deprecated. Use the keydb-copy gem instead." +$stderr.puts "This utility is deprecated. Use the redis-copy gem instead." if ARGV.length != 4 - puts "Usage: keydb-copy.rb <src-host> <src-port> <dst-host> <dst-port>" + puts "Usage: redis-copy.rb <src-host> <src-port> <dst-host> <dst-port>" exit 1 end puts "WARNING: it's up to you to FLUSHDB the destination host before to continue, press any key when ready." diff --git a/vcpkg.json b/vcpkg.json new file mode 100644 index 000000000..fac9b70cc --- /dev/null +++ b/vcpkg.json @@ -0,0 +1,10 @@ +{ + "name": "keydb", + "version-string": "6.3.0", + "port-version": 1, + "homepage": "https://github.com/Snapchat/KeyDB", + "description": "KeyDB is a high performance fork of Redis with a focus on multithreading, memory efficiency, and high throughput. In addition to performance improvements, KeyDB offers features such as Active Replication, FLASH Storage and Subkey Expires. KeyDB has an MVCC architecture that allows you to execute queries such as KEYS and SCAN without blocking the database and degrading performance.", + "dependencies": [ + "jemalloc", "rocksdb" + ] +} \ No newline at end of file
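# A usage sketch for the assert_not_equal helper added in
# tests/support/test.tcl above; the key and value names are illustrative only:
start_server {tags {"example"}} {
    test {assert_not_equal passes when values differ} {
        r set k v1
        set before [r get k]
        r set k v2
        assert_not_equal [r get k] $before "value should have changed"
    }
}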