diff --git a/.cmake-format.yaml b/.cmake-format.yaml index 98ab11753a..60b1b46f1e 100644 --- a/.cmake-format.yaml +++ b/.cmake-format.yaml @@ -53,7 +53,7 @@ format: _help_require_valid_layout: - By default, if cmake-format cannot successfully fit - everything into the desired linewidth it will apply the - - last, most agressive attempt that it made. If this flag is + - last, most aggressive attempt that it made. If this flag is - True, however, cmake-format will print error, exit with non- - zero status code, and write-out nothing require_valid_layout: false diff --git a/.config/typos.toml b/.config/typos.toml index 1dc44ea0e9..a8ecab921c 100644 --- a/.config/typos.toml +++ b/.config/typos.toml @@ -2,16 +2,16 @@ [files] extend-exclude = [ + ".git/", "deps/", # crc16_slottable is primarily pre-generated random strings. "src/crc16_slottable.h", ] +ignore-hidden = false [default.extend-words] -advices = "advices" exat = "exat" optin = "optin" -ro = "ro" smove = "smove" [type.c] @@ -20,7 +20,7 @@ extend-ignore-re = [ "D4C4DAA4", # sha1.c "Georg Nees", "\\[l\\]ist", # eval.c - "LKE", # test_rax.c + '"LKE"', # test_rax.c ] [type.tcl] @@ -28,26 +28,23 @@ extend-ignore-re = [ "DUMPed", ] -[type.sv.extend-identifiers] -# sv = .h -module_gil_acquring = "module_gil_acquring" - [type.c.extend-identifiers] -ang = "ang" +advices = "advices" clen = "clen" fle = "fle" -module_gil_acquring = "module_gil_acquring" nd = "nd" ot = "ot" [type.tcl.extend-identifiers] -fo = "fo" oll = "oll" stressers = "stressers" -[type.sv.extend-words] +[type.sv.extend-identifiers] # sv = .h fo = "fo" + +[type.sv.extend-words] +# sv = .h seeked = "seeked" [type.c.extend-words] @@ -58,7 +55,6 @@ limite = "limite" pn = "pn" seeked = "seeked" tre = "tre" -ws = "ws" [type.systemd.extend-words] # systemd = .conf @@ -66,5 +62,4 @@ ake = "ake" [type.tcl.extend-words] fo = "fo" -lst = "lst" tre = "tre" diff --git a/.github/actions/generate-package-build-matrix/action.yml 
b/.github/actions/generate-package-build-matrix/action.yml index 7e90f27be5..2494a71118 100644 --- a/.github/actions/generate-package-build-matrix/action.yml +++ b/.github/actions/generate-package-build-matrix/action.yml @@ -24,11 +24,11 @@ runs: - name: Get targets run: | - x86_arch=$(jq -c '[.linux_targets[] | select(.arch=="x86_64")]' utils/releasetools/build-config.json) + x86_arch=$(jq -c '[.linux_targets[] | select(.arch=="x86_64")]' .github/actions/generate-package-build-matrix/build-config.json) x86_matrix=$(echo "{ \"distro\" : $x86_arch }" | jq -c .) echo "X86_MATRIX=$x86_matrix" >> $GITHUB_ENV - arm_arch=$(jq -c '[.linux_targets[] | select(.arch=="arm64")]' utils/releasetools/build-config.json) + arm_arch=$(jq -c '[.linux_targets[] | select(.arch=="arm64")]' .github/actions/generate-package-build-matrix/build-config.json) arm_matrix=$(echo "{ \"distro\" : $arm_arch }" | jq -c .) echo "ARM_MATRIX=$arm_matrix" >> $GITHUB_ENV shell: bash diff --git a/utils/releasetools/build-config.json b/.github/actions/generate-package-build-matrix/build-config.json similarity index 60% rename from utils/releasetools/build-config.json rename to .github/actions/generate-package-build-matrix/build-config.json index 5e39fae70f..f64bf601ca 100644 --- a/utils/releasetools/build-config.json +++ b/.github/actions/generate-package-build-matrix/build-config.json @@ -1,29 +1,24 @@ { "linux_targets": [ + { "arch": "x86_64", - "target": "ubuntu18.04", + "target": "ubuntu-20.04", "type": "deb", - "platform": "bionic" + "platform": "focal" }, { "arch": "x86_64", - "target": "ubuntu20.04", + "target": "ubuntu-22.04", "type": "deb", - "platform": "focal" + "platform": "jammy" }, { "arch": "x86_64", - "target": "ubuntu24.04", + "target": "ubuntu-24.04", "type": "deb", "platform": "noble" }, - { - "arch": "arm64", - "target": "ubuntu18.04", - "type": "deb", - "platform": "bionic" - }, { "arch": "arm64", "target": "ubuntu20.04", @@ -32,9 +27,9 @@ }, { "arch": "arm64", - "target": 
"ubuntu24.04", + "target": "ubuntu22.04", "type": "deb", - "platform": "noble" + "platform": "jammy" } ] } \ No newline at end of file diff --git a/.github/workflows/build-release-packages.yml b/.github/workflows/build-release-packages.yml index 094d82de08..d7ab8e57d6 100644 --- a/.github/workflows/build-release-packages.yml +++ b/.github/workflows/build-release-packages.yml @@ -3,7 +3,12 @@ name: Build Release Packages on: release: types: [published] - + push: + paths: + - '.github/workflows/build-release-packages.yml' + - '.github/workflows/call-build-linux-arm-packages.yml' + - '.github/workflows/call-build-linux-x86-packages.yml' + - '.github/actions/generate-package-build-matrix/build-config.json' workflow_dispatch: inputs: version: @@ -11,17 +16,19 @@ on: required: true permissions: + id-token: write contents: read jobs: # This job provides the version metadata from the tag for the other jobs to use. release-build-get-meta: name: Get metadata to build + if: github.event_name == 'workflow_dispatch' || github.repository == 'valkey-io/valkey' runs-on: ubuntu-latest outputs: version: ${{ steps.get_version.outputs.VERSION }} + is_test: ${{ steps.check-if-testing.outputs.IS_TEST }} steps: - - run: | echo "Version: ${{ inputs.version || github.ref_name }}" shell: bash @@ -32,8 +39,13 @@ jobs: - name: Get the version id: get_version run: | - VERSION="${INPUT_VERSION}" + if [[ "${{ github.event_name }}" == "push" ]]; then + VERSION=${{ github.ref_name }} + else + VERSION="${INPUT_VERSION}" + fi if [ -z "${VERSION}" ]; then + echo "Error: No version specified" exit 1 fi echo "VERSION=$VERSION" >> $GITHUB_OUTPUT @@ -43,8 +55,21 @@ jobs: # only ever be a tag INPUT_VERSION: ${{ inputs.version || github.ref_name }} + - name: Check if we are testing + id: check-if-testing + run: | + if [[ "${{ github.event_name }}" == "push" ]]; then + echo "This is a test workflow -> We will upload to the Test S3 Bucket" + echo "IS_TEST=true" >> $GITHUB_OUTPUT + else + echo "This is a 
Release workflow -> We will upload to the Release S3 Bucket" + echo "IS_TEST=false" >> $GITHUB_OUTPUT + fi + shell: bash + generate-build-matrix: name: Generating build matrix + if: github.event_name == 'workflow_dispatch' || github.repository == 'valkey-io/valkey' runs-on: ubuntu-latest outputs: x86_64-build-matrix: ${{ steps.set-matrix.outputs.x86_64-build-matrix }} @@ -56,7 +81,7 @@ jobs: - uses: ./.github/actions/generate-package-build-matrix id: set-matrix with: - ref: ${{ inputs.version || github.ref_name }} + ref: ${{ needs.release-build-get-meta.outputs.version }} release-build-linux-x86-packages: needs: @@ -67,11 +92,10 @@ jobs: version: ${{ needs.release-build-get-meta.outputs.version }} ref: ${{ inputs.version || github.ref_name }} build_matrix: ${{ needs.generate-build-matrix.outputs.x86_64-build-matrix }} + region: us-west-2 secrets: - token: ${{ secrets.GITHUB_TOKEN }} - bucket: ${{ secrets.AWS_S3_BUCKET }} - access_key_id: ${{ secrets.AWS_S3_ACCESS_KEY_ID }} - secret_access_key: ${{ secrets.AWS_S3_ACCESS_KEY }} + bucket_name: ${{ needs.release-build-get-meta.outputs.is_test == 'true' && secrets.AWS_S3_TEST_BUCKET || secrets.AWS_S3_BUCKET }} + role_to_assume: ${{ secrets.AWS_ROLE_TO_ASSUME }} release-build-linux-arm-packages: needs: @@ -82,8 +106,7 @@ jobs: version: ${{ needs.release-build-get-meta.outputs.version }} ref: ${{ inputs.version || github.ref_name }} build_matrix: ${{ needs.generate-build-matrix.outputs.arm64-build-matrix }} + region: us-west-2 secrets: - token: ${{ secrets.GITHUB_TOKEN }} - bucket: ${{ secrets.AWS_S3_BUCKET }} - access_key_id: ${{ secrets.AWS_S3_ACCESS_KEY_ID }} - secret_access_key: ${{ secrets.AWS_S3_ACCESS_KEY }} + bucket_name: ${{ needs.release-build-get-meta.outputs.is_test == 'true' && secrets.AWS_S3_TEST_BUCKET || secrets.AWS_S3_BUCKET }} + role_to_assume: ${{ secrets.AWS_ROLE_TO_ASSUME }} diff --git a/.github/workflows/call-build-linux-arm-packages.yml b/.github/workflows/call-build-linux-arm-packages.yml index 
2a7bcc533f..65445a83c8 100644 --- a/.github/workflows/call-build-linux-arm-packages.yml +++ b/.github/workflows/call-build-linux-arm-packages.yml @@ -15,21 +15,20 @@ on: description: The build targets to produce as a JSON matrix. type: string required: true + region: + description: The AWS region to push packages into. + type: string + required: true secrets: - token: - description: The Github token or similar to authenticate with. + bucket_name: + description: The S3 bucket to push packages into. + required: true + role_to_assume: + description: The role to assume for the S3 bucket. required: true - bucket: - description: The name of the S3 bucket to push packages into. - required: false - access_key_id: - description: The S3 access key id for the bucket. - required: false - secret_access_key: - description: The S3 secret access key for the bucket. - required: false permissions: + id-token: write contents: read jobs: @@ -46,6 +45,12 @@ jobs: with: ref: ${{ inputs.version }} + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: ${{ inputs.region }} + role-to-assume: ${{ secrets.role_to_assume }} + - name: Make Valkey uses: uraimo/run-on-arch-action@v2 with: @@ -65,15 +70,5 @@ jobs: mkdir -p packages-files cp -rfv $TAR_FILE_NAME.tar* packages-files/ - - name: Install AWS cli. 
- run: | - sudo apt-get install -y awscli - - - name: Configure AWS credentials - run: | - aws configure set region us-west-2 - aws configure set aws_access_key_id ${{ secrets.access_key_id }} - aws configure set aws_secret_access_key ${{ secrets.secret_access_key }} - - name: Sync to S3 - run: aws s3 sync packages-files s3://${{secrets.bucket}}/releases/ + run: aws s3 sync packages-files s3://${{ secrets.bucket_name }}/releases/ diff --git a/.github/workflows/call-build-linux-x86-packages.yml b/.github/workflows/call-build-linux-x86-packages.yml index 9e438fa61a..4e68bf85f0 100644 --- a/.github/workflows/call-build-linux-x86-packages.yml +++ b/.github/workflows/call-build-linux-x86-packages.yml @@ -15,28 +15,27 @@ on: description: The build targets to produce as a JSON matrix. type: string required: true + region: + description: The AWS region to upload the packages to. + type: string + required: true secrets: - token: - description: The Github token or similar to authenticate with. + bucket_name: + description: The name of the S3 bucket to upload the packages to. + required: true + role_to_assume: + description: The role to assume for the S3 bucket. required: true - bucket: - description: The name of the S3 bucket to push packages into. - required: false - access_key_id: - description: The S3 access key id for the bucket. - required: false - secret_access_key: - description: The S3 secret access key for the bucket. 
- required: false permissions: + id-token: write contents: read jobs: build-valkey: # Capture source tarball and generate checksum for it name: Build package ${{ matrix.distro.target }} ${{ matrix.distro.arch }} - runs-on: "ubuntu-latest" + runs-on: ${{matrix.distro.target}} strategy: fail-fast: false matrix: ${{ fromJSON(inputs.build_matrix) }} @@ -46,6 +45,12 @@ jobs: with: ref: ${{ inputs.version }} + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: ${{ inputs.region }} + role-to-assume: ${{ secrets.role_to_assume }} + - name: Install dependencies run: sudo apt-get update && sudo apt-get install -y build-essential libssl-dev libsystemd-dev @@ -63,15 +68,5 @@ jobs: mkdir -p packages-files cp -rfv $TAR_FILE_NAME.tar* packages-files/ - - name: Install AWS cli. - run: | - sudo apt-get install -y awscli - - - name: Configure AWS credentials - run: | - aws configure set region us-west-2 - aws configure set aws_access_key_id ${{ secrets.access_key_id }} - aws configure set aws_secret_access_key ${{ secrets.secret_access_key }} - - name: Sync to S3 - run: aws s3 sync packages-files s3://${{secrets.bucket}}/releases/ + run: aws s3 sync packages-files s3://${{ secrets.bucket_name }}/releases/ diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index df3eaa1905..a1014faa99 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -18,10 +18,15 @@ jobs: # Fail build if there are warnings # build with TLS just for compilation coverage run: make -j4 all-with-unit-tests SERVER_CFLAGS='-Werror' BUILD_TLS=yes USE_FAST_FLOAT=yes + - name: install old server for compatibility testing + run: | + cd tests/tmp + wget https://download.valkey.io/releases/valkey-7.2.7-noble-x86_64.tar.gz + tar -xvf valkey-7.2.7-noble-x86_64.tar.gz - name: test run: | sudo apt-get install tcl8.6 tclx - ./runtest --verbose --tags -slow --dump-logs + ./runtest --verbose --tags -slow --dump-logs --other-server-path 
tests/tmp/valkey-7.2.7-noble-x86_64/bin/valkey-server - name: module api test run: CFLAGS='-Werror' ./runtest-moduleapi --verbose --dump-logs - name: validate commands.def up to date diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml index efc63a1f6f..ab4f7a040d 100644 --- a/.github/workflows/clang-format.yml +++ b/.github/workflows/clang-format.yml @@ -1,6 +1,7 @@ name: Clang Format Check on: + push: pull_request: paths: - 'src/**' diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml index 951b5c2862..cd1f1b20a7 100644 --- a/.github/workflows/codecov.yml +++ b/.github/workflows/codecov.yml @@ -10,7 +10,7 @@ concurrency: jobs: code-coverage: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Checkout repository diff --git a/.github/workflows/daily.yml b/.github/workflows/daily.yml index c06d73440d..309e8353f7 100644 --- a/.github/workflows/daily.yml +++ b/.github/workflows/daily.yml @@ -44,7 +44,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'run-extra-tests') || github.event.pull_request.base.ref != 'unstable'))) && !contains(github.event.inputs.skipjobs, 'ubuntu') - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - name: prep if: github.event_name == 'workflow_dispatch' @@ -87,7 +87,7 @@ jobs: (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'run-extra-tests') || github.event.pull_request.base.ref != 'unstable'))) && !contains(github.event.inputs.skipjobs, 'fortify') container: ubuntu:plucky - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - name: prep if: github.event_name == 'workflow_dispatch' @@ -132,7 +132,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'run-extra-tests') 
|| github.event.pull_request.base.ref != 'unstable'))) && !contains(github.event.inputs.skipjobs, 'malloc') - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - name: prep if: github.event_name == 'workflow_dispatch' @@ -171,7 +171,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'run-extra-tests') || github.event.pull_request.base.ref != 'unstable'))) && !contains(github.event.inputs.skipjobs, 'malloc') - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - name: prep if: github.event_name == 'workflow_dispatch' @@ -210,7 +210,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'run-extra-tests') || github.event.pull_request.base.ref != 'unstable'))) && !contains(github.event.inputs.skipjobs, '32bit') - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - name: prep if: github.event_name == 'workflow_dispatch' @@ -256,7 +256,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'run-extra-tests') || github.event.pull_request.base.ref != 'unstable'))) && !contains(github.event.inputs.skipjobs, 'tls') - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - name: prep if: github.event_name == 'workflow_dispatch' @@ -302,7 +302,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'run-extra-tests') || github.event.pull_request.base.ref != 'unstable'))) && !contains(github.event.inputs.skipjobs, 'tls') - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - name: prep if: github.event_name == 'workflow_dispatch' @@ -348,7 +348,7 @@ jobs: 
(github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'run-extra-tests') || github.event.pull_request.base.ref != 'unstable'))) && !contains(github.event.inputs.skipjobs, 'iothreads') - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - name: prep if: github.event_name == 'workflow_dispatch' @@ -375,6 +375,44 @@ jobs: if: true && !contains(github.event.inputs.skiptests, 'cluster') run: ./runtest-cluster --io-threads ${{github.event.inputs.cluster_test_args}} + test-ubuntu-tls-io-threads: + runs-on: ubuntu-latest + if: | + (github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || + (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'run-extra-tests') || github.event.pull_request.base.ref != 'unstable'))) && + !contains(github.event.inputs.skipjobs, 'tls') && !contains(github.event.inputs.skipjobs, 'iothreads') + timeout-minutes: 1440 + steps: + - name: prep + if: github.event_name == 'workflow_dispatch' + run: | + echo "GITHUB_REPOSITORY=${{github.event.inputs.use_repo}}" >> $GITHUB_ENV + echo "GITHUB_HEAD_REF=${{github.event.inputs.use_git_ref}}" >> $GITHUB_ENV + echo "skipjobs: ${{github.event.inputs.skipjobs}}" + echo "skiptests: ${{github.event.inputs.skiptests}}" + echo "test_args: ${{github.event.inputs.test_args}}" + echo "cluster_test_args: ${{github.event.inputs.cluster_test_args}}" + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + with: + repository: ${{ env.GITHUB_REPOSITORY }} + ref: ${{ env.GITHUB_HEAD_REF }} + - name: make + run: | + make BUILD_TLS=yes SERVER_CFLAGS='-Werror' + - name: testprep + run: | + sudo apt-get install tcl8.6 tclx tcl-tls + ./utils/gen-test-certs.sh + - name: test + if: true && !contains(github.event.inputs.skiptests, 'valkey') + run: | + ./runtest --io-threads --tls 
--accurate --verbose --tags network --dump-logs ${{github.event.inputs.test_args}} + - name: cluster tests + if: true && !contains(github.event.inputs.skiptests, 'cluster') + run: | + ./runtest-cluster --io-threads --tls ${{github.event.inputs.cluster_test_args}} + test-ubuntu-reclaim-cache: runs-on: ubuntu-latest if: | @@ -382,7 +420,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'run-extra-tests') || github.event.pull_request.base.ref != 'unstable'))) && !contains(github.event.inputs.skipjobs, 'specific') - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - name: prep if: github.event_name == 'workflow_dispatch' @@ -458,7 +496,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && github.event.pull_request.base.ref != 'unstable')) && !contains(github.event.inputs.skipjobs, 'valgrind') && !contains(github.event.inputs.skiptests, 'valkey') - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - name: prep if: github.event_name == 'workflow_dispatch' @@ -490,7 +528,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && github.event.pull_request.base.ref != 'unstable')) && !contains(github.event.inputs.skipjobs, 'valgrind') && !(contains(github.event.inputs.skiptests, 'modules') && contains(github.event.inputs.skiptests, 'unittest')) - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - name: prep if: github.event_name == 'workflow_dispatch' @@ -527,7 +565,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && github.event.pull_request.base.ref != 'unstable')) && !contains(github.event.inputs.skipjobs, 'valgrind') && !contains(github.event.inputs.skiptests, 'valkey') - timeout-minutes: 14400 + 
timeout-minutes: 1440 steps: - name: prep if: github.event_name == 'workflow_dispatch' @@ -559,7 +597,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && github.event.pull_request.base.ref != 'unstable')) && !contains(github.event.inputs.skipjobs, 'valgrind') && !(contains(github.event.inputs.skiptests, 'modules') && contains(github.event.inputs.skiptests, 'unittest')) - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - name: prep if: github.event_name == 'workflow_dispatch' @@ -596,7 +634,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && github.event.pull_request.base.ref != 'unstable')) && !contains(github.event.inputs.skipjobs, 'sanitizer') - timeout-minutes: 14400 + timeout-minutes: 1440 strategy: fail-fast: false matrix: @@ -646,7 +684,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && github.event.pull_request.base.ref != 'unstable')) && !contains(github.event.inputs.skipjobs, 'sanitizer') - timeout-minutes: 14400 + timeout-minutes: 1440 strategy: fail-fast: false matrix: @@ -689,6 +727,52 @@ jobs: if: true && !contains(github.event.inputs.skiptests, 'unittest') run: ./src/valkey-unit-tests --accurate + test-sanitizer-force-defrag: + runs-on: ubuntu-latest + if: | + (github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || + (github.event_name == 'pull_request' && github.event.pull_request.base.ref != 'unstable')) && + !contains(github.event.inputs.skipjobs, 'sanitizer') + timeout-minutes: 1440 + strategy: + fail-fast: false + steps: + - name: prep + if: github.event_name == 'workflow_dispatch' + run: | + echo "GITHUB_REPOSITORY=${{github.event.inputs.use_repo}}" >> $GITHUB_ENV + echo "GITHUB_HEAD_REF=${{github.event.inputs.use_git_ref}}" >> 
$GITHUB_ENV + echo "skipjobs: ${{github.event.inputs.skipjobs}}" + echo "skiptests: ${{github.event.inputs.skiptests}}" + echo "test_args: ${{github.event.inputs.test_args}}" + echo "cluster_test_args: ${{github.event.inputs.cluster_test_args}}" + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + with: + repository: ${{ env.GITHUB_REPOSITORY }} + ref: ${{ env.GITHUB_HEAD_REF }} + - name: make + run: make all-with-unit-tests OPT=-O3 SANITIZER=address DEBUG_FORCE_DEFRAG=yes USE_JEMALLOC=no SERVER_CFLAGS='-Werror' + - name: testprep + run: | + sudo apt-get update + sudo apt-get install tcl8.6 tclx -y + - name: test + if: true && !contains(github.event.inputs.skiptests, 'valkey') + run: ./runtest --accurate --verbose --dump-logs ${{github.event.inputs.test_args}} + - name: module api test + if: true && !contains(github.event.inputs.skiptests, 'modules') + run: CFLAGS='-Werror' ./runtest-moduleapi --verbose --dump-logs ${{github.event.inputs.test_args}} + - name: sentinel tests + if: true && !contains(github.event.inputs.skiptests, 'sentinel') + run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}} + - name: cluster tests + if: true && !contains(github.event.inputs.skiptests, 'cluster') + run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}} + - name: unittest + if: true && !contains(github.event.inputs.skiptests, 'unittest') + run: ./src/valkey-unit-tests + test-rpm-distros-jemalloc: if: | (github.event_name == 'workflow_dispatch' || @@ -717,7 +801,7 @@ jobs: runs-on: ubuntu-latest container: ${{ matrix.container }} - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - name: prep @@ -783,7 +867,7 @@ jobs: runs-on: ubuntu-latest container: ${{ matrix.container }} - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - name: prep @@ -855,7 +939,7 @@ jobs: runs-on: ubuntu-latest container: ${{ matrix.container }} - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - name: prep @@ -906,7 +990,7 @@ jobs: 
(github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'run-extra-tests') || github.event.pull_request.base.ref != 'unstable'))) && !contains(github.event.inputs.skipjobs, 'macos') && !(contains(github.event.inputs.skiptests, 'valkey') && contains(github.event.inputs.skiptests, 'modules')) - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - name: prep if: github.event_name == 'workflow_dispatch' @@ -937,7 +1021,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'run-extra-tests') || github.event.pull_request.base.ref != 'unstable'))) && !contains(github.event.inputs.skipjobs, 'macos') && !contains(github.event.inputs.skiptests, 'sentinel') - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - name: prep if: github.event_name == 'workflow_dispatch' @@ -965,7 +1049,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'run-extra-tests') || github.event.pull_request.base.ref != 'unstable'))) && !contains(github.event.inputs.skipjobs, 'macos') && !contains(github.event.inputs.skiptests, 'cluster') - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - name: prep if: github.event_name == 'workflow_dispatch' @@ -997,7 +1081,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'run-extra-tests') || github.event.pull_request.base.ref != 'unstable'))) && !contains(github.event.inputs.skipjobs, 'macos') - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - uses: maxim-lobanov/setup-xcode@60606e260d2fc5762a71e64e74b2174e8ea3c8bd # v1.6.0 with: @@ -1025,7 +1109,7 @@ 
jobs: (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'run-extra-tests') || github.event.pull_request.base.ref != 'unstable'))) && !contains(github.event.inputs.skipjobs, 'freebsd') - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - name: prep if: github.event_name == 'workflow_dispatch' @@ -1132,11 +1216,11 @@ jobs: reply-schemas-validator: runs-on: ubuntu-latest - timeout-minutes: 14400 + timeout-minutes: 1440 if: | (github.event_name == 'workflow_dispatch' || (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || - (github.event_name == 'pull_request' && github.event.pull_request.base.ref != 'unstable')) && + (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'run-extra-tests') || github.event.pull_request.base.ref != 'unstable'))) && !contains(github.event.inputs.skipjobs, 'reply-schema') steps: - name: prep diff --git a/.github/workflows/external.yml b/.github/workflows/external.yml index cfcab995d5..a9777538fd 100644 --- a/.github/workflows/external.yml +++ b/.github/workflows/external.yml @@ -17,7 +17,7 @@ jobs: test-external-standalone: runs-on: ubuntu-latest if: github.event_name != 'schedule' || github.repository == 'valkey-io/valkey' - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: Build @@ -34,7 +34,7 @@ jobs: --tags -slow - name: Archive server log if: ${{ failure() }} - uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32 # v3.1.3 + uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 with: name: test-external-standalone-log path: external-server.log @@ -42,7 +42,7 @@ jobs: test-external-cluster: runs-on: ubuntu-latest if: github.event_name != 'schedule' || github.repository == 'valkey-io/valkey' - timeout-minutes: 14400 + 
timeout-minutes: 1440 steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: Build @@ -62,7 +62,7 @@ jobs: --tags -slow - name: Archive server log if: ${{ failure() }} - uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32 # v3.1.3 + uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 with: name: test-external-cluster-log path: external-server.log @@ -70,7 +70,7 @@ jobs: test-external-nodebug: runs-on: ubuntu-latest if: github.event_name != 'schedule' || github.repository == 'valkey-io/valkey' - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: Build @@ -86,7 +86,7 @@ jobs: --tags "-slow -needs:debug" - name: Archive server log if: ${{ failure() }} - uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32 # v3.1.3 + uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 with: name: test-external-nodebug-log path: external-server.log diff --git a/.github/workflows/spell-check.yml b/.github/workflows/spell-check.yml index 69d9b9cb6a..14db670b24 100644 --- a/.github/workflows/spell-check.yml +++ b/.github/workflows/spell-check.yml @@ -26,7 +26,7 @@ jobs: uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: Install typos - uses: taiki-e/install-action@cd5df4de2e75f3b819ba55f780f7bb8cd4a05a41 # v2.32.2 + uses: taiki-e/install-action@fe9759bf4432218c779595708e80a1aadc85cedc # v2.46.10 with: tool: typos diff --git a/.gitignore b/.gitignore index d5cac316e6..d85087c459 100644 --- a/.gitignore +++ b/.gitignore @@ -49,6 +49,7 @@ nodes*.conf tests/cluster/tmp/* tests/rdma/rdma-test tags +build/ build-debug/ build-release/ cmake-build-debug/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 77d0c4e7d8..55b18cb994 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,3 +41,4 @@ unset(BUILD_UNIT_TESTS CACHE) unset(BUILD_TEST_MODULES CACHE) 
unset(BUILD_EXAMPLE_MODULES CACHE) unset(USE_TLS CACHE) +unset(DEBUG_FORCE_DEFRAG CACHE) diff --git a/cmake/Modules/SourceFiles.cmake b/cmake/Modules/SourceFiles.cmake index 1a754ff846..e51f9b7600 100644 --- a/cmake/Modules/SourceFiles.cmake +++ b/cmake/Modules/SourceFiles.cmake @@ -47,7 +47,7 @@ set(VALKEY_SERVER_SRCS ${CMAKE_SOURCE_DIR}/src/cluster_slot_stats.c ${CMAKE_SOURCE_DIR}/src/crc16.c ${CMAKE_SOURCE_DIR}/src/endianconv.c - ${CMAKE_SOURCE_DIR}/src/slowlog.c + ${CMAKE_SOURCE_DIR}/src/commandlog.c ${CMAKE_SOURCE_DIR}/src/eval.c ${CMAKE_SOURCE_DIR}/src/bio.c ${CMAKE_SOURCE_DIR}/src/rio.c @@ -100,6 +100,7 @@ set(VALKEY_SERVER_SRCS ${CMAKE_SOURCE_DIR}/src/script_lua.c ${CMAKE_SOURCE_DIR}/src/script.c ${CMAKE_SOURCE_DIR}/src/functions.c + ${CMAKE_SOURCE_DIR}/src/scripting_engine.c ${CMAKE_SOURCE_DIR}/src/function_lua.c ${CMAKE_SOURCE_DIR}/src/commands.c ${CMAKE_SOURCE_DIR}/src/strl.c diff --git a/cmake/Modules/ValkeySetup.cmake b/cmake/Modules/ValkeySetup.cmake index 8a4d4da1c9..77360844fc 100644 --- a/cmake/Modules/ValkeySetup.cmake +++ b/cmake/Modules/ValkeySetup.cmake @@ -93,6 +93,9 @@ macro (valkey_build_and_install_bin target sources ld_flags libs link_name) target_link_libraries(${target} execinfo) endif () + # Enable all warnings + fail on warning + target_compile_options(${target} PRIVATE -Werror -Wall) + # Install cli tool and create a redis symbolic link valkey_install_bin(${target}) valkey_create_symlink(${target} ${link_name}) diff --git a/deps/CMakeLists.txt b/deps/CMakeLists.txt index c904b94031..3f5b04dc22 100644 --- a/deps/CMakeLists.txt +++ b/deps/CMakeLists.txt @@ -1,4 +1,6 @@ -add_subdirectory(jemalloc) +if (USE_JEMALLOC) + add_subdirectory(jemalloc) +endif () add_subdirectory(lua) # Set hiredis options. We need to disable the defaults set in the OPTION(..) 
we do this by setting them in the CACHE diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b87dff3db0..90d7e25cf4 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -22,6 +22,12 @@ if (VALKEY_RELEASE_BUILD) set_property(TARGET valkey-server PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE) endif () +if (DEBUG_FORCE_DEFRAG) + message(STATUS "Forcing Active Defrag run on valkey-server") + target_compile_definitions(valkey-server PRIVATE DEBUG_FORCE_DEFRAG) + target_compile_definitions(valkey-server PRIVATE HAVE_DEFRAG) +endif () + if (BUILD_SANITIZER) # 'BUILD_SANITIZER' is defined in ValkeySetup module (based on user input) # If defined, the variables 'VALKEY_SANITAIZER_CFLAGS' and 'VALKEY_SANITAIZER_LDFLAGS' diff --git a/src/Makefile b/src/Makefile index 8552deb3d9..7a951193e4 100644 --- a/src/Makefile +++ b/src/Makefile @@ -130,6 +130,11 @@ ifdef REDIS_LDFLAGS SERVER_LDFLAGS := $(REDIS_LDFLAGS) endif +# Special case of forcing defrag to run even though we have no Jemalloc support +ifeq ($(DEBUG_FORCE_DEFRAG), yes) + SERVER_CFLAGS +=-DHAVE_DEFRAG -DDEBUG_FORCE_DEFRAG +endif + FINAL_CFLAGS=$(STD) $(WARN) $(OPT) $(DEBUG) $(CFLAGS) $(SERVER_CFLAGS) FINAL_LDFLAGS=$(LDFLAGS) $(OPT) $(SERVER_LDFLAGS) $(DEBUG) FINAL_LIBS=-lm @@ -369,7 +374,7 @@ else endef endif -# Determine install/uninstall Redis symlinks for compatibility when +# Determine install/uninstall Redis symlinks for compatibility when # installing/uninstalling Valkey binaries (defaulting to `yes`) USE_REDIS_SYMLINKS?=yes ifeq ($(USE_REDIS_SYMLINKS),yes) @@ -411,7 +416,7 @@ endif ENGINE_NAME=valkey SERVER_NAME=$(ENGINE_NAME)-server$(PROG_SUFFIX) ENGINE_SENTINEL_NAME=$(ENGINE_NAME)-sentinel$(PROG_SUFFIX) -ENGINE_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o hashtable.o kvstore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o memory_prefetch.o io_threads.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o
t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o cluster_slot_stats.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crccombine.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o valkey-check-rdb.o valkey-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o allocator_defrag.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o rdma.o +ENGINE_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o hashtable.o kvstore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o memory_prefetch.o io_threads.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o cluster_slot_stats.o crc16.o endianconv.o commandlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crccombine.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o valkey-check-rdb.o valkey-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o allocator_defrag.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o rdma.o scripting_engine.o ENGINE_CLI_NAME=$(ENGINE_NAME)-cli$(PROG_SUFFIX) ENGINE_CLI_OBJ=anet.o adlist.o dict.o valkey-cli.o zmalloc.o release.o 
ae.o serverassert.o crcspeed.o crccombine.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o cli_commands.o ENGINE_BENCHMARK_NAME=$(ENGINE_NAME)-benchmark$(PROG_SUFFIX) diff --git a/src/acl.c b/src/acl.c index cfcf102887..807ef744d2 100644 --- a/src/acl.c +++ b/src/acl.c @@ -29,6 +29,7 @@ #include "server.h" #include "sha256.h" +#include "module.h" #include #include @@ -297,11 +298,6 @@ int ACLListMatchSds(void *a, void *b) { return sdscmp(a, b) == 0; } -/* Method to free list elements from ACL users password/patterns lists. */ -void ACLListFreeSds(void *item) { - sdsfree(item); -} - /* Method to duplicate list elements from ACL users password/patterns lists. */ void *ACLListDupSds(void *item) { return sdsdup(item); @@ -374,7 +370,7 @@ aclSelector *ACLCreateSelector(int flags) { listSetFreeMethod(selector->patterns, ACLListFreeKeyPattern); listSetDupMethod(selector->patterns, ACLListDupKeyPattern); listSetMatchMethod(selector->channels, ACLListMatchSds); - listSetFreeMethod(selector->channels, ACLListFreeSds); + listSetFreeMethod(selector->channels, sdsfreeVoid); listSetDupMethod(selector->channels, ACLListDupSds); memset(selector->allowed_commands, 0, sizeof(selector->allowed_commands)); @@ -445,7 +441,7 @@ user *ACLCreateUser(const char *name, size_t namelen) { u->passwords = listCreate(); u->acl_string = NULL; listSetMatchMethod(u->passwords, ACLListMatchSds); - listSetFreeMethod(u->passwords, ACLListFreeSds); + listSetFreeMethod(u->passwords, sdsfreeVoid); listSetDupMethod(u->passwords, ACLListDupSds); u->selectors = listCreate(); @@ -489,6 +485,11 @@ void ACLFreeUser(user *u) { zfree(u); } +/* Used for generic free functions. */ +static void ACLFreeUserVoid(void *u) { + ACLFreeUser(u); +} + /* When a user is deleted we need to cycle the active * connections in order to kill all the pending ones that * are authenticated with such user. 
*/ @@ -654,7 +655,7 @@ void ACLChangeSelectorPerm(aclSelector *selector, struct serverCommand *cmd, int ACLResetFirstArgsForCommand(selector, id); if (cmd->subcommands_ht) { hashtableIterator iter; - hashtableInitSafeIterator(&iter, cmd->subcommands_ht); + hashtableInitIterator(&iter, cmd->subcommands_ht, HASHTABLE_ITER_SAFE); void *next; while (hashtableNext(&iter, &next)) { struct serverCommand *sub = next; @@ -672,7 +673,7 @@ void ACLChangeSelectorPerm(aclSelector *selector, struct serverCommand *cmd, int * found and the operation was performed. */ void ACLSetSelectorCommandBitsForCategory(hashtable *commands, aclSelector *selector, uint64_t cflag, int value) { hashtableIterator iter; - hashtableInitIterator(&iter, commands); + hashtableInitIterator(&iter, commands, 0); void *next; while (hashtableNext(&iter, &next)) { struct serverCommand *cmd = next; @@ -740,7 +741,7 @@ void ACLCountCategoryBitsForCommands(hashtable *commands, unsigned long *off, uint64_t cflag) { hashtableIterator iter; - hashtableInitIterator(&iter, commands); + hashtableInitIterator(&iter, commands, 0); void *next; while (hashtableNext(&iter, &next)) { struct serverCommand *cmd = next; @@ -1077,19 +1078,24 @@ int ACLSetSelector(aclSelector *selector, const char *op, size_t oplen) { int flags = 0; size_t offset = 1; if (op[0] == '%') { + int perm_ok = 1; for (; offset < oplen; offset++) { if (toupper(op[offset]) == 'R' && !(flags & ACL_READ_PERMISSION)) { flags |= ACL_READ_PERMISSION; } else if (toupper(op[offset]) == 'W' && !(flags & ACL_WRITE_PERMISSION)) { flags |= ACL_WRITE_PERMISSION; - } else if (op[offset] == '~' && flags) { + } else if (op[offset] == '~') { offset++; break; } else { - errno = EINVAL; - return C_ERR; + perm_ok = 0; + break; } } + if (!flags || !perm_ok) { + errno = EINVAL; + return C_ERR; + } } else { flags = ACL_ALL_PERMISSION; } @@ -1954,7 +1960,7 @@ int ACLShouldKillPubsubClient(client *c, list *upcoming) { if (getClientType(c) == CLIENT_TYPE_PUBSUB) { /* Check for 
pattern violations. */ - dictIterator *di = dictGetIterator(c->pubsub_patterns); + dictIterator *di = dictGetIterator(c->pubsub_data->pubsub_patterns); dictEntry *de; while (!kill && ((de = dictNext(di)) != NULL)) { o = dictGetKey(de); @@ -1966,7 +1972,7 @@ int ACLShouldKillPubsubClient(client *c, list *upcoming) { /* Check for channel violations. */ if (!kill) { /* Check for global channels violation. */ - di = dictGetIterator(c->pubsub_channels); + di = dictGetIterator(c->pubsub_data->pubsub_channels); while (!kill && ((de = dictNext(di)) != NULL)) { o = dictGetKey(de); @@ -1977,7 +1983,7 @@ int ACLShouldKillPubsubClient(client *c, list *upcoming) { } if (!kill) { /* Check for shard channels violation. */ - di = dictGetIterator(c->pubsubshard_channels); + di = dictGetIterator(c->pubsub_data->pubsubshard_channels); while (!kill && ((de = dictNext(di)) != NULL)) { o = dictGetKey(de); int res = ACLCheckChannelAgainstList(upcoming, o->ptr, sdslen(o->ptr), 0); @@ -2445,12 +2451,12 @@ sds ACLLoadFromFile(const char *filename) { c->user = new_user; } - if (user_channels) raxFreeWithCallback(user_channels, (void (*)(void *))listRelease); - raxFreeWithCallback(old_users, (void (*)(void *))ACLFreeUser); + if (user_channels) raxFreeWithCallback(user_channels, listReleaseVoid); + raxFreeWithCallback(old_users, ACLFreeUserVoid); sdsfree(errors); return NULL; } else { - raxFreeWithCallback(Users, (void (*)(void *))ACLFreeUser); + raxFreeWithCallback(Users, ACLFreeUserVoid); Users = old_users; errors = sdscat(errors, "WARNING: ACL errors detected, no change to the previously active ACL rules was performed"); @@ -2759,7 +2765,7 @@ sds getAclErrorMessage(int acl_res, user *user, struct serverCommand *cmd, sds e /* ACL CAT category */ void aclCatWithFlags(client *c, hashtable *commands, uint64_t cflag, int *arraylen) { hashtableIterator iter; - hashtableInitIterator(&iter, commands); + hashtableInitIterator(&iter, commands, 0); void *next; while (hashtableNext(&iter, &next)) { 
struct serverCommand *cmd = next; diff --git a/src/adlist.c b/src/adlist.c index 11b152592b..0dc77cc038 100644 --- a/src/adlist.c +++ b/src/adlist.c @@ -77,6 +77,12 @@ void listRelease(list *list) { zfree(list); } +/* Just like listRelease, but takes the list as a (void *). + * Useful as generic free callback. */ +void listReleaseVoid(void *l) { + listRelease((list *)l); +} + /* Add a new node to the list, to head, containing the specified 'value' * pointer as value. * diff --git a/src/adlist.h b/src/adlist.h index bfc4280434..c642c1c791 100644 --- a/src/adlist.h +++ b/src/adlist.h @@ -72,6 +72,7 @@ typedef struct list { /* Prototypes */ list *listCreate(void); void listRelease(list *list); +void listReleaseVoid(void *list); void listEmpty(list *list); list *listAddNodeHead(list *list, void *value); list *listAddNodeTail(list *list, void *value); diff --git a/src/allocator_defrag.c b/src/allocator_defrag.c index b2330c95e0..5e805b3044 100644 --- a/src/allocator_defrag.c +++ b/src/allocator_defrag.c @@ -43,12 +43,10 @@ * the other component to ensure both are using the same allocator configuration. */ -#include +#include "server.h" #include "serverassert.h" #include "allocator_defrag.h" -#define UNUSED(x) (void)(x) - #if defined(HAVE_DEFRAG) && defined(USE_JEMALLOC) #define STRINGIFY_(x) #x @@ -402,8 +400,55 @@ int allocatorShouldDefrag(void *ptr) { je_cb.bin_info[binind].nregs - SLAB_NFREE(out, 0)); } -#else +/* Utility function to get the fragmentation ratio from jemalloc. + * It is critical to do that by comparing only heap maps that belong to + * jemalloc, and skip ones the jemalloc keeps as spare. Since we use this + * fragmentation ratio in order to decide if a defrag action should be taken + * or not, a false detection can cause the defragmenter to waste a lot of CPU + * without the possibility of getting any results.
 */ +float getAllocatorFragmentation(size_t *out_frag_bytes) { + size_t resident, active, allocated, frag_smallbins_bytes; + zmalloc_get_allocator_info(&allocated, &active, &resident, NULL, NULL); + frag_smallbins_bytes = allocatorDefragGetFragSmallbins(); + /* Calculate the fragmentation ratio as the proportion of wasted memory in small + * bins (which are defraggable) relative to the total allocated memory (including large bins). + * This is because otherwise, if most of the memory usage is large bins, we may show high percentage, + * despite the fact it's not a lot of memory for the user. */ + float frag_pct = (float)frag_smallbins_bytes / allocated * 100; + float rss_pct = ((float)resident / allocated) * 100 - 100; + size_t rss_bytes = resident - allocated; + if (out_frag_bytes) *out_frag_bytes = frag_smallbins_bytes; + serverLog(LL_DEBUG, "allocated=%zu, active=%zu, resident=%zu, frag=%.2f%% (%.2f%% rss), frag_bytes=%zu (%zu rss)", + allocated, active, resident, frag_pct, rss_pct, frag_smallbins_bytes, rss_bytes); + return frag_pct; +} +#elif defined(DEBUG_FORCE_DEFRAG) +int allocatorDefragInit(void) { + return 0; +} +void allocatorDefragFree(void *ptr, size_t size) { + UNUSED(size); + zfree(ptr); +} +__attribute__((malloc)) void *allocatorDefragAlloc(size_t size) { + return zmalloc(size); +} +unsigned long allocatorDefragGetFragSmallbins(void) { + return 0; +} + +int allocatorShouldDefrag(void *ptr) { + UNUSED(ptr); + return 1; +} + +float getAllocatorFragmentation(size_t *out_frag_bytes) { + if (out_frag_bytes) *out_frag_bytes = server.active_defrag_ignore_bytes + 1; + return server.active_defrag_threshold_upper; +} + +#else int allocatorDefragInit(void) { return -1; } @@ -423,4 +468,9 @@ int allocatorShouldDefrag(void *ptr) { UNUSED(ptr); return 0; } + +float getAllocatorFragmentation(size_t *out_frag_bytes) { + UNUSED(out_frag_bytes); + return 0; +} #endif diff --git a/src/allocator_defrag.h b/src/allocator_defrag.h index 7fb56208b6..7947bef72c 100644 ---
a/src/allocator_defrag.h +++ b/src/allocator_defrag.h @@ -5,10 +5,11 @@ #include /* We can enable the server defrag capabilities only if we are using Jemalloc * and the version that has the experimental.utilization namespace in mallctl . */ -#if defined(JEMALLOC_VERSION_MAJOR) && \ - (JEMALLOC_VERSION_MAJOR > 5 || \ - (JEMALLOC_VERSION_MAJOR == 5 && JEMALLOC_VERSION_MINOR > 2) || \ - (JEMALLOC_VERSION_MAJOR == 5 && JEMALLOC_VERSION_MINOR == 2 && JEMALLOC_VERSION_BUGFIX >= 1)) +#if (defined(JEMALLOC_VERSION_MAJOR) && \ + (JEMALLOC_VERSION_MAJOR > 5 || \ + (JEMALLOC_VERSION_MAJOR == 5 && JEMALLOC_VERSION_MINOR > 2) || \ + (JEMALLOC_VERSION_MAJOR == 5 && JEMALLOC_VERSION_MINOR == 2 && JEMALLOC_VERSION_BUGFIX >= 1))) || \ + defined(DEBUG_FORCE_DEFRAG) #define HAVE_DEFRAG #endif #endif @@ -18,5 +19,6 @@ void allocatorDefragFree(void *ptr, size_t size); __attribute__((malloc)) void *allocatorDefragAlloc(size_t size); unsigned long allocatorDefragGetFragSmallbins(void); int allocatorShouldDefrag(void *ptr); +float getAllocatorFragmentation(size_t *out_frag_bytes); #endif /* __ALLOCATOR_DEFRAG_H */ diff --git a/src/aof.c b/src/aof.c index 0fd3cf5c26..fc3f935db4 100644 --- a/src/aof.c +++ b/src/aof.c @@ -31,6 +31,7 @@ #include "bio.h" #include "rio.h" #include "functions.h" +#include "module.h" #include #include @@ -1010,16 +1011,22 @@ int startAppendOnly(void) { * the first call is short, there is a end-of-space condition, so the next * is likely to fail. However apparently in modern systems this is no longer * true, and in general it looks just more resilient to retry the write. If - * there is an actual error condition we'll get it at the next try. */ -ssize_t aofWrite(int fd, const char *buf, size_t len) { - ssize_t nwritten = 0, totwritten = 0; + * there is an actual error condition we'll get it at the next try. + * We also check for aof-max-size limit here returning custom error on exceed. 
*/ +ssize_t aofWrite(int fd, const char *buf, size_t len, off_t aof_current_size, unsigned long long aof_max_size) { + ssize_t nwritten = 0, totwritten = 0, nonewritten = -1; + + if (aof_max_size && (unsigned long long)aof_current_size >= aof_max_size) { + errno = EFBIG; + return nonewritten; + } while (len) { nwritten = write(fd, buf, len); if (nwritten < 0) { if (errno == EINTR) continue; - return totwritten ? totwritten : -1; + return totwritten ? totwritten : nonewritten; } len -= nwritten; @@ -1119,7 +1126,7 @@ void flushAppendOnlyFile(int force) { } latencyStartMonitor(latency); - nwritten = aofWrite(server.aof_fd, server.aof_buf, sdslen(server.aof_buf)); + nwritten = aofWrite(server.aof_fd, server.aof_buf, sdslen(server.aof_buf), server.aof_current_size, server.aof_max_size); latencyEndMonitor(latency); /* We want to capture different events for delayed writes: * when the delay happens with a pending fsync, or with a saving child @@ -1151,7 +1158,7 @@ void flushAppendOnlyFile(int force) { /* Log the AOF write error and record the error code. */ if (nwritten == -1) { if (can_log) { - serverLog(LL_WARNING, "Error writing to the AOF file: %s", strerror(errno)); + serverLog(LL_WARNING, "Error writing to the AOF file: %s", getAofWriteErrStr(errno)); } server.aof_last_write_errno = errno; } else { @@ -1375,7 +1382,8 @@ struct client *createAOFClient(void) { /* We set the fake client as a replica waiting for the synchronization * so that the server will not try to send replies to this client. 
*/ - c->repl_state = REPLICA_STATE_WAIT_BGSAVE_START; + initClientReplicationData(c); + c->repl_data->repl_state = REPLICA_STATE_WAIT_BGSAVE_START; return c; } @@ -1888,30 +1896,29 @@ int rewriteSortedSetObject(rio *r, robj *key, robj *o) { } } else if (o->encoding == OBJ_ENCODING_SKIPLIST) { zset *zs = o->ptr; - dictIterator *di = dictGetIterator(zs->dict); - dictEntry *de; - - while ((de = dictNext(di)) != NULL) { - sds ele = dictGetKey(de); - double *score = dictGetVal(de); - + hashtableIterator iter; + hashtableInitIterator(&iter, zs->ht, 0); + void *next; + while (hashtableNext(&iter, &next)) { + zskiplistNode *node = next; if (count == 0) { int cmd_items = (items > AOF_REWRITE_ITEMS_PER_CMD) ? AOF_REWRITE_ITEMS_PER_CMD : items; if (!rioWriteBulkCount(r, '*', 2 + cmd_items * 2) || !rioWriteBulkString(r, "ZADD", 4) || !rioWriteBulkObject(r, key)) { - dictReleaseIterator(di); + hashtableResetIterator(&iter); return 0; } } - if (!rioWriteBulkDouble(r, *score) || !rioWriteBulkString(r, ele, sdslen(ele))) { - dictReleaseIterator(di); + sds ele = node->ele; + if (!rioWriteBulkDouble(r, node->score) || !rioWriteBulkString(r, ele, sdslen(ele))) { + hashtableResetIterator(&iter); return 0; } if (++count == AOF_REWRITE_ITEMS_PER_CMD) count = 0; items--; } - dictReleaseIterator(di); + hashtableResetIterator(&iter); } else { serverPanic("Unknown sorted zset encoding"); } @@ -1921,7 +1928,7 @@ int rewriteSortedSetObject(rio *r, robj *key, robj *o) { /* Write either the key or the value of the currently selected item of a hash. * The 'hi' argument passes a valid hash iterator. * The 'what' filed specifies if to write a key or a value and can be - * either OBJ_HASH_KEY or OBJ_HASH_VALUE. + * either OBJ_HASH_FIELD or OBJ_HASH_VALUE. * * The function returns 0 on error, non-zero on success. 
*/ static int rioWriteHashIteratorCursor(rio *r, hashTypeIterator *hi, int what) { @@ -1935,7 +1942,7 @@ static int rioWriteHashIteratorCursor(rio *r, hashTypeIterator *hi, int what) { return rioWriteBulkString(r, (char *)vstr, vlen); else return rioWriteBulkLongLong(r, vll); - } else if (hi->encoding == OBJ_ENCODING_HT) { + } else if (hi->encoding == OBJ_ENCODING_HASHTABLE) { sds value = hashTypeCurrentFromHashTable(hi, what); return rioWriteBulkString(r, value, sdslen(value)); } @@ -1962,7 +1969,7 @@ int rewriteHashObject(rio *r, robj *key, robj *o) { } } - if (!rioWriteHashIteratorCursor(r, &hi, OBJ_HASH_KEY) || !rioWriteHashIteratorCursor(r, &hi, OBJ_HASH_VALUE)) { + if (!rioWriteHashIteratorCursor(r, &hi, OBJ_HASH_FIELD) || !rioWriteHashIteratorCursor(r, &hi, OBJ_HASH_VALUE)) { hashTypeResetIterator(&hi); return 0; } @@ -2161,7 +2168,7 @@ int rewriteModuleObject(rio *r, robj *key, robj *o, int dbid) { ValkeyModuleIO io; moduleValue *mv = o->ptr; moduleType *mt = mv->type; - moduleInitIOContext(io, mt, r, key, dbid); + moduleInitIOContext(&io, mt, r, key, dbid); mt->aof_rewrite(&io, key, mv->value); if (io.ctx) { moduleFreeContext(io.ctx); @@ -2216,7 +2223,7 @@ int rewriteAppendOnlyFileRio(rio *aof) { if (rioWrite(aof, selectcmd, sizeof(selectcmd) - 1) == 0) goto werr; if (rioWriteBulkLongLong(aof, j) == 0) goto werr; - kvs_it = kvstoreIteratorInit(db->keys); + kvs_it = kvstoreIteratorInit(db->keys, HASHTABLE_ITER_SAFE | HASHTABLE_ITER_PREFETCH_VALUES); /* Iterate this DB writing every entry */ void *next; while (kvstoreIteratorNext(kvs_it, &next)) { diff --git a/src/blocked.c b/src/blocked.c index aeec560b3f..1edb7728cc 100644 --- a/src/blocked.c +++ b/src/blocked.c @@ -61,10 +61,11 @@ */ #include "server.h" -#include "slowlog.h" +#include "commandlog.h" #include "latency.h" #include "monotonic.h" #include "cluster_slot_stats.h" +#include "module.h" /* forward declarations */ static void unblockClientWaitingData(client *c); @@ -74,16 +75,25 @@ static void 
moduleUnblockClientOnKey(client *c, robj *key); static void releaseBlockedEntry(client *c, dictEntry *de, int remove_key); void initClientBlockingState(client *c) { - c->bstate.btype = BLOCKED_NONE; - c->bstate.timeout = 0; - c->bstate.unblock_on_nokey = 0; - c->bstate.keys = dictCreate(&objectKeyHeapPointerValueDictType); - c->bstate.numreplicas = 0; - c->bstate.numlocal = 0; - c->bstate.reploffset = 0; - c->bstate.generic_blocked_list_node = NULL; - c->bstate.module_blocked_handle = NULL; - c->bstate.async_rm_call_handle = NULL; + if (c->bstate) return; + c->bstate = zmalloc(sizeof(blockingState)); + c->bstate->btype = BLOCKED_NONE; + c->bstate->timeout = 0; + c->bstate->unblock_on_nokey = 0; + c->bstate->keys = dictCreate(&objectKeyHeapPointerValueDictType); + c->bstate->numreplicas = 0; + c->bstate->numlocal = 0; + c->bstate->reploffset = 0; + c->bstate->generic_blocked_list_node = NULL; + c->bstate->module_blocked_handle = NULL; + c->bstate->async_rm_call_handle = NULL; +} + +void freeClientBlockingState(client *c) { + if (!c->bstate) return; + dictRelease(c->bstate->keys); + zfree(c->bstate); + c->bstate = NULL; } /* Block a client for the specific operation type. Once the CLIENT_BLOCKED @@ -93,8 +103,10 @@ void blockClient(client *c, int btype) { /* Primary client should never be blocked unless pause or module */ serverAssert(!(c->flag.primary && btype != BLOCKED_MODULE && btype != BLOCKED_POSTPONE)); + initClientBlockingState(c); + c->flag.blocked = 1; - c->bstate.btype = btype; + c->bstate->btype = btype; if (!c->flag.module) server.blocked_clients++; /* We count blocked client stats on regular clients and not on module clients */ server.blocked_clients_by_type[btype]++; @@ -105,19 +117,31 @@ void blockClient(client *c, int btype) { * he will attempt to reprocess the command which will update the statistics. 
* However in case the client was timed out or in case of module blocked client is being unblocked * the command will not be reprocessed and we need to make stats update. - * This function will make updates to the commandstats, slot-stats, slowlog and monitors.*/ -void updateStatsOnUnblock(client *c, long blocked_us, long reply_us, int had_errors) { - const ustime_t total_cmd_duration = c->duration + blocked_us + reply_us; - c->lastcmd->microseconds += total_cmd_duration; - clusterSlotStatsAddCpuDuration(c, total_cmd_duration); + * This function will make updates to the commandstats, slot-stats, commandlog and monitors. + * The failed_or_rejected parameter is an indication that the blocked command was either failed internally or + * rejected/aborted externally. In case the command was rejected the value ERROR_COMMAND_REJECTED should be passed. + * In case the command failed internally, ERROR_COMMAND_FAILED should be passed. + * A value of zero indicate no error was reported after the command was unblocked */ +void updateStatsOnUnblock(client *c, long blocked_us, long reply_us, int failed_or_rejected) { + c->duration += blocked_us + reply_us; + c->lastcmd->microseconds += c->duration; + clusterSlotStatsAddCpuDuration(c, c->duration); c->lastcmd->calls++; c->commands_processed++; server.stat_numcommands++; - if (had_errors) c->lastcmd->failed_calls++; + debugServerAssertWithInfo(c, NULL, failed_or_rejected >= 0 && failed_or_rejected <= ERROR_COMMAND_FAILED); + if (failed_or_rejected) { + if (failed_or_rejected & ERROR_COMMAND_FAILED) + c->lastcmd->failed_calls++; + else if (failed_or_rejected & ERROR_COMMAND_REJECTED) + c->lastcmd->rejected_calls++; + else + debugServerAssertWithInfo(c, NULL, 0); + } if (server.latency_tracking_enabled) - updateCommandLatencyHistogram(&(c->lastcmd->latency_histogram), total_cmd_duration * 1000); - /* Log the command into the Slow log if needed. 
*/ - slowlogPushCurrentCommand(c, c->lastcmd, total_cmd_duration); + updateCommandLatencyHistogram(&(c->lastcmd->latency_histogram), c->duration * 1000); + /* Log the command into the commandlog if needed. */ + commandlogPushCurrentCommand(c, c->lastcmd); c->duration = 0; /* Log the reply duration event. */ latencyAddSampleIfNeeded("command-unblocking", reply_us / 1000); @@ -186,18 +210,18 @@ void queueClientForReprocessing(client *c) { /* Unblock a client calling the right function depending on the kind * of operation the client is blocking for. */ void unblockClient(client *c, int queue_for_reprocessing) { - if (c->bstate.btype == BLOCKED_LIST || c->bstate.btype == BLOCKED_ZSET || c->bstate.btype == BLOCKED_STREAM) { + if (c->bstate->btype == BLOCKED_LIST || c->bstate->btype == BLOCKED_ZSET || c->bstate->btype == BLOCKED_STREAM) { unblockClientWaitingData(c); - } else if (c->bstate.btype == BLOCKED_WAIT) { + } else if (c->bstate->btype == BLOCKED_WAIT) { unblockClientWaitingReplicas(c); - } else if (c->bstate.btype == BLOCKED_MODULE) { + } else if (c->bstate->btype == BLOCKED_MODULE) { if (moduleClientIsBlockedOnKeys(c)) unblockClientWaitingData(c); unblockClientFromModule(c); - } else if (c->bstate.btype == BLOCKED_POSTPONE) { - serverAssert(c->bstate.postponed_list_node); - listDelNode(server.postponed_clients, c->bstate.postponed_list_node); - c->bstate.postponed_list_node = NULL; - } else if (c->bstate.btype == BLOCKED_SHUTDOWN) { + } else if (c->bstate->btype == BLOCKED_POSTPONE) { + serverAssert(c->bstate->postponed_list_node); + listDelNode(server.postponed_clients, c->bstate->postponed_list_node); + c->bstate->postponed_list_node = NULL; + } else if (c->bstate->btype == BLOCKED_SHUTDOWN) { /* No special cleanup. 
*/ } else { serverPanic("Unknown btype in unblockClient()."); @@ -205,7 +229,7 @@ void unblockClient(client *c, int queue_for_reprocessing) { /* Reset the client for a new query, unless the client has pending command to process * or in case a shutdown operation was canceled and we are still in the processCommand sequence */ - if (!c->flag.pending_command && c->bstate.btype != BLOCKED_SHUTDOWN) { + if (!c->flag.pending_command && c->bstate->btype != BLOCKED_SHUTDOWN) { /* Clients that are not blocked on keys are not reprocessed so we must * call reqresAppendResponse here (for clients blocked on key, * unblockClientOnKey is called, which eventually calls processCommand, @@ -216,12 +240,12 @@ void unblockClient(client *c, int queue_for_reprocessing) { /* We count blocked client stats on regular clients and not on module clients */ if (!c->flag.module) server.blocked_clients--; - server.blocked_clients_by_type[c->bstate.btype]--; + server.blocked_clients_by_type[c->bstate->btype]--; /* Clear the flags, and put the client in the unblocked list so that * we'll process new commands in its query buffer ASAP. */ c->flag.blocked = 0; - c->bstate.btype = BLOCKED_NONE; - c->bstate.unblock_on_nokey = 0; + c->bstate->btype = BLOCKED_NONE; + c->bstate->unblock_on_nokey = 0; removeClientFromTimeoutTable(c); if (queue_for_reprocessing) queueClientForReprocessing(c); } @@ -230,22 +254,22 @@ void unblockClient(client *c, int queue_for_reprocessing) { * send it a reply of some kind. After this function is called, * unblockClient() will be called with the same client as argument. 
*/ void replyToBlockedClientTimedOut(client *c) { - if (c->bstate.btype == BLOCKED_LIST || c->bstate.btype == BLOCKED_ZSET || c->bstate.btype == BLOCKED_STREAM) { + if (c->bstate->btype == BLOCKED_LIST || c->bstate->btype == BLOCKED_ZSET || c->bstate->btype == BLOCKED_STREAM) { addReplyNullArray(c); updateStatsOnUnblock(c, 0, 0, 0); - } else if (c->bstate.btype == BLOCKED_WAIT) { + } else if (c->bstate->btype == BLOCKED_WAIT) { if (c->cmd->proc == waitCommand) { - addReplyLongLong(c, replicationCountAcksByOffset(c->bstate.reploffset)); + addReplyLongLong(c, replicationCountAcksByOffset(c->bstate->reploffset)); } else if (c->cmd->proc == waitaofCommand) { addReplyArrayLen(c, 2); - addReplyLongLong(c, server.fsynced_reploff >= c->bstate.reploffset); - addReplyLongLong(c, replicationCountAOFAcksByOffset(c->bstate.reploffset)); + addReplyLongLong(c, server.fsynced_reploff >= c->bstate->reploffset); + addReplyLongLong(c, replicationCountAOFAcksByOffset(c->bstate->reploffset)); } else if (c->cmd->proc == clusterCommand) { addReplyErrorObject(c, shared.noreplicaserr); } else { serverPanic("Unknown wait command %s in replyToBlockedClientTimedOut().", c->cmd->declared_name); } - } else if (c->bstate.btype == BLOCKED_MODULE) { + } else if (c->bstate->btype == BLOCKED_MODULE) { moduleBlockedClientTimedOut(c, 0); } else { serverPanic("Unknown btype in replyToBlockedClientTimedOut()."); @@ -261,7 +285,7 @@ void replyToClientsBlockedOnShutdown(void) { listRewind(server.clients, &li); while ((ln = listNext(&li))) { client *c = listNodeValue(ln); - if (c->flag.blocked && c->bstate.btype == BLOCKED_SHUTDOWN) { + if (c->flag.blocked && c->bstate->btype == BLOCKED_SHUTDOWN) { addReplyError(c, "Errors trying to SHUTDOWN. Check logs."); unblockClient(c, 1); } @@ -288,7 +312,7 @@ void disconnectAllBlockedClients(void) { * command processing will start from scratch, and the command will * be either executed or rejected. 
(unlike LIST blocked clients for * which the command is already in progress in a way. */ - if (c->bstate.btype == BLOCKED_POSTPONE) continue; + if (c->bstate->btype == BLOCKED_POSTPONE) continue; unblockClientOnError(c, "-UNBLOCKED force unblock from blocking operation, " "instance state changed (master -> replica?)"); @@ -373,15 +397,17 @@ void blockForKeys(client *c, int btype, robj **keys, int numkeys, mstime_t timeo list *l; int j; + initClientBlockingState(c); + if (!c->flag.reprocessing_command) { /* If the client is re-processing the command, we do not set the timeout * because we need to retain the client's original timeout. */ - c->bstate.timeout = timeout; + c->bstate->timeout = timeout; } for (j = 0; j < numkeys; j++) { /* If the key already exists in the dictionary ignore it. */ - if (!(client_blocked_entry = dictAddRaw(c->bstate.keys, keys[j], NULL))) { + if (!(client_blocked_entry = dictAddRaw(c->bstate->keys, keys[j], NULL))) { continue; } incrRefCount(keys[j]); @@ -398,7 +424,7 @@ void blockForKeys(client *c, int btype, robj **keys, int numkeys, mstime_t timeo l = dictGetVal(db_blocked_existing_entry); } listAddNodeTail(l, c); - dictSetVal(c->bstate.keys, client_blocked_entry, listLast(l)); + dictSetVal(c->bstate->keys, client_blocked_entry, listLast(l)); /* We need to add the key to blocking_keys_unblock_on_nokey, if the client * wants to be awakened if key is deleted (like XREADGROUP) */ @@ -412,7 +438,7 @@ void blockForKeys(client *c, int btype, robj **keys, int numkeys, mstime_t timeo } } } - c->bstate.unblock_on_nokey = unblock_on_nokey; + c->bstate->unblock_on_nokey = unblock_on_nokey; /* Currently we assume key blocking will require reprocessing the command. 
* However in case of modules, they have a different way to handle the reprocessing * which does not require setting the pending command flag */ @@ -426,15 +452,15 @@ static void unblockClientWaitingData(client *c) { dictEntry *de; dictIterator *di; - if (dictSize(c->bstate.keys) == 0) return; + if (dictSize(c->bstate->keys) == 0) return; - di = dictGetIterator(c->bstate.keys); + di = dictGetIterator(c->bstate->keys); /* The client may wait for multiple keys, so unblock it for every key. */ while ((de = dictNext(di)) != NULL) { releaseBlockedEntry(c, de, 0); } dictReleaseIterator(di); - dictEmpty(c->bstate.keys, NULL); + dictEmpty(c->bstate->keys, NULL); } static blocking_type getBlockedTypeByType(int type) { @@ -533,7 +559,7 @@ static void releaseBlockedEntry(client *c, dictEntry *de, int remove_key) { if (listLength(l) == 0) { dictDelete(c->db->blocking_keys, key); dictDelete(c->db->blocking_keys_unblock_on_nokey, key); - } else if (c->bstate.unblock_on_nokey) { + } else if (c->bstate->unblock_on_nokey) { unblock_on_nokey_entry = dictFind(c->db->blocking_keys_unblock_on_nokey, key); /* it is not possible to have a client blocked on nokey with no matching entry */ serverAssertWithInfo(c, key, unblock_on_nokey_entry != NULL); @@ -542,7 +568,7 @@ static void releaseBlockedEntry(client *c, dictEntry *de, int remove_key) { dictDelete(c->db->blocking_keys_unblock_on_nokey, key); } } - if (remove_key) dictDelete(c->bstate.keys, key); + if (remove_key) dictDelete(c->bstate->keys, key); } void signalKeyAsReady(serverDb *db, robj *key, int type) { @@ -580,9 +606,9 @@ static void handleClientsBlockedOnKey(readyList *rl) { * module is trying to accomplish right now. * 3. In case of XREADGROUP call we will want to unblock on any change in object type * or in case the key was deleted, since the group is no longer valid. 
*/ - if ((o != NULL && (receiver->bstate.btype == getBlockedTypeByType(o->type))) || - (o != NULL && (receiver->bstate.btype == BLOCKED_MODULE)) || (receiver->bstate.unblock_on_nokey)) { - if (receiver->bstate.btype != BLOCKED_MODULE) + if ((o != NULL && (receiver->bstate->btype == getBlockedTypeByType(o->type))) || + (o != NULL && (receiver->bstate->btype == BLOCKED_MODULE)) || (receiver->bstate->unblock_on_nokey)) { + if (receiver->bstate->btype != BLOCKED_MODULE) unblockClientOnKey(receiver, rl->key); else moduleUnblockClientOnKey(receiver, rl->key); @@ -593,16 +619,17 @@ static void handleClientsBlockedOnKey(readyList *rl) { /* block a client for replica acknowledgement */ void blockClientForReplicaAck(client *c, mstime_t timeout, long long offset, long numreplicas, int numlocal) { - c->bstate.timeout = timeout; - c->bstate.reploffset = offset; - c->bstate.numreplicas = numreplicas; - c->bstate.numlocal = numlocal; + initClientBlockingState(c); + c->bstate->timeout = timeout; + c->bstate->reploffset = offset; + c->bstate->numreplicas = numreplicas; + c->bstate->numlocal = numlocal; listAddNodeHead(server.clients_waiting_acks, c); /* Note that we remember the linked list node where the client is stored, * this way removing the client in unblockClientWaitingReplicas() will not * require a linear scan, but just a constant time operation. */ - serverAssert(c->bstate.client_waiting_acks_list_node == NULL); - c->bstate.client_waiting_acks_list_node = listFirst(server.clients_waiting_acks); + serverAssert(c->bstate->client_waiting_acks_list_node == NULL); + c->bstate->client_waiting_acks_list_node = listFirst(server.clients_waiting_acks); blockClient(c, BLOCKED_WAIT); } @@ -610,11 +637,12 @@ void blockClientForReplicaAck(client *c, mstime_t timeout, long long offset, lon * requesting to avoid processing clients commands which will be processed later * when the it is ready to accept them. 
*/ void blockPostponeClient(client *c) { - c->bstate.timeout = 0; + initClientBlockingState(c); + c->bstate->timeout = 0; blockClient(c, BLOCKED_POSTPONE); listAddNodeTail(server.postponed_clients, c); - serverAssert(c->bstate.postponed_list_node == NULL); - c->bstate.postponed_list_node = listLast(server.postponed_clients); + serverAssert(c->bstate->postponed_list_node == NULL); + c->bstate->postponed_list_node = listLast(server.postponed_clients); /* Mark this client to execute its command */ c->flag.pending_command = 1; } @@ -631,13 +659,13 @@ void blockClientShutdown(client *c) { static void unblockClientOnKey(client *c, robj *key) { dictEntry *de; - de = dictFind(c->bstate.keys, key); + de = dictFind(c->bstate->keys, key); releaseBlockedEntry(c, de, 1); /* Only in case of blocking API calls, we might be blocked on several keys. however we should force unblock the entire blocking keys */ - serverAssert(c->bstate.btype == BLOCKED_STREAM || c->bstate.btype == BLOCKED_LIST || - c->bstate.btype == BLOCKED_ZSET); + serverAssert(c->bstate->btype == BLOCKED_STREAM || c->bstate->btype == BLOCKED_LIST || + c->bstate->btype == BLOCKED_ZSET); /* We need to unblock the client before calling processCommandAndResetClient * because it checks the CLIENT_BLOCKED flag */ @@ -680,7 +708,8 @@ static void moduleUnblockClientOnKey(client *c, robj *key) { elapsedStart(&replyTimer); if (moduleTryServeClientBlockedOnKey(c, key)) { - updateStatsOnUnblock(c, 0, elapsedUs(replyTimer), server.stat_total_error_replies != prev_error_replies); + updateStatsOnUnblock(c, 0, elapsedUs(replyTimer), + ((server.stat_total_error_replies != prev_error_replies) ? ERROR_COMMAND_FAILED : 0)); moduleUnblockClient(c); } /* We need to call afterCommand even if the client was not unblocked @@ -698,7 +727,7 @@ static void moduleUnblockClientOnKey(client *c, robj *key) { * command with timeout reply. 
*/ void unblockClientOnTimeout(client *c) { /* The client has been unlocked (in the moduleUnblocked list), return ASAP. */ - if (c->bstate.btype == BLOCKED_MODULE && isModuleClientUnblocked(c)) return; + if (c->bstate->btype == BLOCKED_MODULE && isModuleClientUnblocked(c)) return; replyToBlockedClientTimedOut(c); if (c->flag.pending_command) c->flag.pending_command = 0; @@ -709,7 +738,7 @@ void unblockClientOnTimeout(client *c) { * If err_str is provided it will be used to reply to the blocked client */ void unblockClientOnError(client *c, const char *err_str) { if (err_str) addReplyError(c, err_str); - updateStatsOnUnblock(c, 0, 0, 1); + updateStatsOnUnblock(c, 0, 0, ERROR_COMMAND_REJECTED); if (c->flag.pending_command) c->flag.pending_command = 0; unblockClient(c, 1); } diff --git a/src/call_reply.c b/src/call_reply.c index 00d196081e..dc981b8be8 100644 --- a/src/call_reply.c +++ b/src/call_reply.c @@ -559,7 +559,7 @@ CallReply *callReplyCreateError(sds reply, void *private_data) { sdsfree(reply); } list *deferred_error_list = listCreate(); - listSetFreeMethod(deferred_error_list, (void (*)(void *))sdsfree); + listSetFreeMethod(deferred_error_list, sdsfreeVoid); listAddNodeTail(deferred_error_list, sdsnew(err_buff)); return callReplyCreate(err_buff, deferred_error_list, private_data); } diff --git a/src/cluster.c b/src/cluster.c index df6bb86454..cedcd9ecb1 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -36,6 +36,7 @@ #include "server.h" #include "cluster.h" #include "cluster_slot_stats.h" +#include "module.h" #include @@ -909,7 +910,7 @@ void clusterCommand(client *c) { unsigned int numkeys = maxkeys > keys_in_slot ? 
keys_in_slot : maxkeys; addReplyArrayLen(c, numkeys); kvstoreHashtableIterator *kvs_di = NULL; - kvs_di = kvstoreGetHashtableIterator(server.db->keys, slot); + kvs_di = kvstoreGetHashtableIterator(server.db->keys, slot, 0); for (unsigned int i = 0; i < numkeys; i++) { void *next; serverAssert(kvstoreHashtableIteratorNext(kvs_di, &next)); @@ -1005,7 +1006,7 @@ getNodeByQuery(client *c, struct serverCommand *cmd, robj **argv, int argc, int /* If CLIENT_MULTI flag is not set EXEC is just going to return an * error. */ if (!c->flag.multi) return myself; - ms = &c->mstate; + ms = c->mstate; } else { /* In order to have a single codepath create a fake Multi State * structure if the client is not in MULTI/EXEC state, this way @@ -1022,7 +1023,7 @@ getNodeByQuery(client *c, struct serverCommand *cmd, robj **argv, int argc, int /* Only valid for sharded pubsub as regular pubsub can operate on any node and bypasses this layer. */ int pubsubshard_included = - (cmd_flags & CMD_PUBSUB) || (c->cmd->proc == execCommand && (c->mstate.cmd_flags & CMD_PUBSUB)); + (cmd_flags & CMD_PUBSUB) || (c->cmd->proc == execCommand && (c->mstate->cmd_flags & CMD_PUBSUB)); /* Check that all the keys are in the same hash slot, and obtain this * slot and the node associated. */ @@ -1175,7 +1176,7 @@ getNodeByQuery(client *c, struct serverCommand *cmd, robj **argv, int argc, int * node is a replica and the request is about a hash slot our primary * is serving, we can reply without redirection. */ int is_write_command = - (cmd_flags & CMD_WRITE) || (c->cmd->proc == execCommand && (c->mstate.cmd_flags & CMD_WRITE)); + (cmd_flags & CMD_WRITE) || (c->cmd->proc == execCommand && (c->mstate->cmd_flags & CMD_WRITE)); if ((c->flag.readonly || pubsubshard_included) && !is_write_command && clusterNodeIsReplica(myself) && clusterNodeGetPrimary(myself) == n) { return myself; @@ -1232,14 +1233,14 @@ void clusterRedirectClient(client *c, clusterNode *n, int hashslot, int error_co * returns 1. 
Otherwise 0 is returned and no operation is performed. */ int clusterRedirectBlockedClientIfNeeded(client *c) { clusterNode *myself = getMyClusterNode(); - if (c->flag.blocked && (c->bstate.btype == BLOCKED_LIST || c->bstate.btype == BLOCKED_ZSET || - c->bstate.btype == BLOCKED_STREAM || c->bstate.btype == BLOCKED_MODULE)) { + if (c->flag.blocked && (c->bstate->btype == BLOCKED_LIST || c->bstate->btype == BLOCKED_ZSET || + c->bstate->btype == BLOCKED_STREAM || c->bstate->btype == BLOCKED_MODULE)) { dictEntry *de; dictIterator *di; /* If the client is blocked on module, but not on a specific key, * don't unblock it. */ - if (c->bstate.btype == BLOCKED_MODULE && !moduleClientIsBlockedOnKeys(c)) return 0; + if (c->bstate->btype == BLOCKED_MODULE && !moduleClientIsBlockedOnKeys(c)) return 0; /* If the cluster is down, unblock the client with the right error. * If the cluster is configured to allow reads on cluster down, we @@ -1251,7 +1252,7 @@ int clusterRedirectBlockedClientIfNeeded(client *c) { } /* All keys must belong to the same slot, so check first key only. */ - di = dictGetIterator(c->bstate.keys); + di = dictGetIterator(c->bstate->keys); if ((de = dictNext(di)) != NULL) { robj *key = dictGetKey(de); int slot = keyHashSlot((char *)key->ptr, sdslen(key->ptr)); diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index a273fe0d86..5e976d3060 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -38,6 +38,7 @@ #include "cluster_slot_stats.h" #include "endianconv.h" #include "connection.h" +#include "module.h" #include #include @@ -121,6 +122,7 @@ void freeClusterLink(clusterLink *link); int verifyClusterNodeId(const char *name, int length); sds clusterEncodeOpenSlotsAuxField(int rdbflags); int clusterDecodeOpenSlotsAuxField(int rdbflags, sds s); +static int nodeExceedsHandshakeTimeout(clusterNode *node, mstime_t now); /* Only primaries that own slots have voting rights. * Returns 1 if the node has voting rights, otherwise returns 0. 
*/ @@ -424,9 +426,19 @@ typedef struct { union { clusterMsg msg; clusterMsgLight msg_light; - }; + } data[]; } clusterMsgSendBlock; +/* Helper function to extract a light message from a send block. */ +static clusterMsgLight *getLightMessageFromSendBlock(clusterMsgSendBlock *msgblock) { + return &msgblock->data[0].msg_light; +} + +/* Helper function to extract a normal message from a send block. */ +static clusterMsg *getMessageFromSendBlock(clusterMsgSendBlock *msgblock) { + return &msgblock->data[0].msg; +} + /* ----------------------------------------------------------------------------- * Initialization * -------------------------------------------------------------------------- */ @@ -654,7 +666,8 @@ int clusterLoadConfig(char *filename) { } else if (!strcasecmp(s, "handshake")) { n->flags |= CLUSTER_NODE_HANDSHAKE; } else if (!strcasecmp(s, "noaddr")) { - n->flags |= CLUSTER_NODE_NOADDR; + n->flags |= (CLUSTER_NODE_NOADDR | CLUSTER_NODE_FAIL); + n->fail_time = mstime(); } else if (!strcasecmp(s, "nofailover")) { n->flags |= CLUSTER_NODE_NOFAILOVER; } else if (!strcasecmp(s, "noflags")) { @@ -804,6 +817,7 @@ int clusterSaveConfig(int do_fsync) { ssize_t written_bytes; int fd = -1; int retval = C_ERR; + mstime_t latency; server.cluster->todo_before_sleep &= ~CLUSTER_TODO_SAVE_CONFIG; @@ -817,11 +831,15 @@ int clusterSaveConfig(int do_fsync) { /* Create a temp file with the new content. 
*/ tmpfilename = sdscatfmt(sdsempty(), "%s.tmp-%i-%I", server.cluster_configfile, (int)getpid(), mstime()); + latencyStartMonitor(latency); if ((fd = open(tmpfilename, O_WRONLY | O_CREAT, 0644)) == -1) { serverLog(LL_WARNING, "Could not open temp cluster config file: %s", strerror(errno)); goto cleanup; } + latencyEndMonitor(latency); + latencyAddSampleIfNeeded("cluster-config-open", latency); + latencyStartMonitor(latency); while (offset < content_size) { written_bytes = write(fd, ci + offset, content_size - offset); if (written_bytes <= 0) { @@ -832,31 +850,52 @@ int clusterSaveConfig(int do_fsync) { } offset += written_bytes; } + latencyEndMonitor(latency); + latencyAddSampleIfNeeded("cluster-config-write", latency); if (do_fsync) { + latencyStartMonitor(latency); server.cluster->todo_before_sleep &= ~CLUSTER_TODO_FSYNC_CONFIG; if (valkey_fsync(fd) == -1) { serverLog(LL_WARNING, "Could not sync tmp cluster config file: %s", strerror(errno)); goto cleanup; } + latencyEndMonitor(latency); + latencyAddSampleIfNeeded("cluster-config-fsync", latency); } + latencyStartMonitor(latency); if (rename(tmpfilename, server.cluster_configfile) == -1) { serverLog(LL_WARNING, "Could not rename tmp cluster config file: %s", strerror(errno)); goto cleanup; } + latencyEndMonitor(latency); + latencyAddSampleIfNeeded("cluster-config-rename", latency); if (do_fsync) { + latencyStartMonitor(latency); if (fsyncFileDir(server.cluster_configfile) == -1) { serverLog(LL_WARNING, "Could not sync cluster config file dir: %s", strerror(errno)); goto cleanup; } + latencyEndMonitor(latency); + latencyAddSampleIfNeeded("cluster-config-dir-fsync", latency); } retval = C_OK; /* If we reached this point, everything is fine. 
*/ cleanup: - if (fd != -1) close(fd); - if (retval) unlink(tmpfilename); + if (fd != -1) { + latencyStartMonitor(latency); + close(fd); + latencyEndMonitor(latency); + latencyAddSampleIfNeeded("cluster-config-close", latency); + } + if (retval == C_ERR) { + latencyStartMonitor(latency); + unlink(tmpfilename); + latencyEndMonitor(latency); + latencyAddSampleIfNeeded("cluster-config-unlink", latency); + } sdsfree(tmpfilename); sdsfree(ci); return retval; @@ -1091,6 +1130,7 @@ void clusterInit(void) { server.cluster->failover_auth_time = 0; server.cluster->failover_auth_count = 0; server.cluster->failover_auth_rank = 0; + server.cluster->failover_failed_primary_rank = 0; server.cluster->failover_auth_epoch = 0; server.cluster->cant_failover_reason = CLUSTER_CANT_FAILOVER_NONE; server.cluster->lastVoteEpoch = 0; @@ -1288,15 +1328,15 @@ void clusterReset(int hard) { * CLUSTER communication link * -------------------------------------------------------------------------- */ clusterMsgSendBlock *createClusterMsgSendBlock(int type, uint32_t msglen) { - uint32_t blocklen = msglen + offsetof(clusterMsgSendBlock, msg); + uint32_t blocklen = msglen + offsetof(clusterMsgSendBlock, data); clusterMsgSendBlock *msgblock = zcalloc(blocklen); msgblock->refcount = 1; msgblock->totlen = blocklen; server.stat_cluster_links_memory += blocklen; if (IS_LIGHT_MESSAGE(type)) { - clusterBuildMessageHdrLight(&msgblock->msg_light, type, msglen); + clusterBuildMessageHdrLight(getLightMessageFromSendBlock(msgblock), type, msglen); } else { - clusterBuildMessageHdr(&msgblock->msg, type, msglen); + clusterBuildMessageHdr(getMessageFromSendBlock(msgblock), type, msglen); } return msgblock; } @@ -1336,6 +1376,11 @@ clusterLink *createClusterLink(clusterNode *node) { * with this link will have the 'link' field set to NULL. */ void freeClusterLink(clusterLink *link) { serverAssert(link != NULL); + serverLog(LL_DEBUG, "Freeing cluster link for node: %.40s:%s (%s)", + link->node ? 
link->node->name : "", + link->inbound ? "inbound" : "outbound", + link->node ? link->node->human_nodename : ""); + if (link->conn) { connClose(link->conn); link->conn = NULL; @@ -1351,6 +1396,7 @@ void freeClusterLink(clusterLink *link) { } else if (link->node->inbound_link == link) { serverAssert(link->inbound); link->node->inbound_link = NULL; + link->node->inbound_link_freed_time = mstime(); } } zfree(link); @@ -1487,9 +1533,11 @@ clusterNode *createClusterNode(char *nodename, int flags) { node->last_in_ping_gossip = 0; node->ping_sent = node->pong_received = 0; node->data_received = 0; + node->meet_sent = 0; node->fail_time = 0; node->link = NULL; node->inbound_link = NULL; + node->inbound_link_freed_time = node->ctime; memset(node->ip, 0, sizeof(node->ip)); node->announce_client_ipv4 = sdsempty(); node->announce_client_ipv6 = sdsempty(); @@ -1499,7 +1547,6 @@ clusterNode *createClusterNode(char *nodename, int flags) { node->cport = 0; node->tls_port = 0; node->fail_reports = listCreate(); - node->voted_time = 0; node->orphaned_time = 0; node->repl_offset_time = 0; node->repl_offset = 0; @@ -1547,9 +1594,14 @@ int clusterNodeAddFailureReport(clusterNode *failing, clusterNode *sender) { * older than the global node timeout. Note that anyway for a node to be * flagged as FAIL we need to have a local PFAIL state that is at least * older than the global node timeout, so we don't just trust the number - * of failure reports from other nodes. */ + * of failure reports from other nodes. + * + * If the reporting node loses its voting right during this time, we will + * also clear its report. 
*/ void clusterNodeCleanupFailureReports(clusterNode *node) { list *l = node->fail_reports; + if (!listLength(l)) return; + listNode *ln; listIter li; clusterNodeFailReport *fr; @@ -1559,7 +1611,11 @@ void clusterNodeCleanupFailureReports(clusterNode *node) { listRewind(l, &li); while ((ln = listNext(&li)) != NULL) { fr = ln->value; - if (now - fr->time > maxtime) listDelNode(l, ln); + if (now - fr->time > maxtime) { + listDelNode(l, ln); + } else if (!clusterNodeIsVotingPrimary(fr->node)) { + listDelNode(l, ln); + } } } @@ -1576,6 +1632,8 @@ void clusterNodeCleanupFailureReports(clusterNode *node) { * Otherwise 0 is returned. */ int clusterNodeDelFailureReport(clusterNode *node, clusterNode *sender) { list *l = node->fail_reports; + if (!listLength(l)) return 0; + listNode *ln; listIter li; clusterNodeFailReport *fr; @@ -1696,6 +1754,9 @@ void clusterAddNode(clusterNode *node) { * it is a replica node. */ void clusterDelNode(clusterNode *delnode) { + serverAssert(delnode != NULL); + serverLog(LL_DEBUG, "Deleting node %.40s (%s) from cluster view", delnode->name, delnode->human_nodename); + int j; dictIterator *di; dictEntry *de; @@ -2078,7 +2139,7 @@ void clearNodeFailureIfNeeded(clusterNode *node) { /* Return 1 if we already have a node in HANDSHAKE state matching the * specified ip address and port number. This function is used in order to * avoid adding a new handshake node for the same address multiple times. */ -int clusterHandshakeInProgress(char *ip, int port, int cport) { +static int clusterHandshakeInProgress(char *ip, int port, int cport) { dictIterator *di; dictEntry *de; @@ -2100,7 +2161,7 @@ int clusterHandshakeInProgress(char *ip, int port, int cport) { * * EAGAIN - There is already a handshake in progress for this address. * EINVAL - IP or port are not valid. 
*/ -int clusterStartHandshake(char *ip, int port, int cport) { +static int clusterStartHandshake(char *ip, int port, int cport) { clusterNode *n; char norm_ip[NET_IP_STR_LEN]; struct sockaddr_storage sa; @@ -2246,10 +2307,11 @@ void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) { /* Ignore gossips about self. */ if (node && node != myself) { /* We already know this node. - Handle failure reports, only when the sender is a voting primary. */ - if (sender && clusterNodeIsVotingPrimary(sender)) { + * Handle failure reports, the report is added only if the sender is a voting primary, + * and deletion of a failure report is not restricted. */ + if (sender) { if (flags & (CLUSTER_NODE_FAIL | CLUSTER_NODE_PFAIL)) { - if (clusterNodeAddFailureReport(node, sender)) { + if (clusterNodeIsVotingPrimary(sender) && clusterNodeAddFailureReport(node, sender)) { serverLog(LL_NOTICE, "Node %.40s (%s) reported node %.40s (%s) as not reachable.", sender->name, sender->human_nodename, node->name, node->human_nodename); } @@ -2874,6 +2936,10 @@ void clusterProcessPingExtensions(clusterMsg *hdr, clusterLink *link) { if (n && n != myself && !(nodeIsReplica(myself) && myself->replicaof == n)) { sds id = sdsnewlen(forgotten_node_ext->name, CLUSTER_NAMELEN); dictEntry *de = dictAddOrFind(server.cluster->nodes_black_list, id); + if (dictGetKey(de) != id) { + /* The dict did not take ownership of the id string, so we need to free it. 
*/ + sdsfree(id); + } uint64_t expire = server.unixtime + ntohu64(forgotten_node_ext->ttl); dictSetUnsignedIntegerVal(de, expire); clusterDelNode(n); @@ -2995,7 +3061,8 @@ int clusterIsValidPacket(clusterLink *link) { } if (type == server.cluster_drop_packet_filter || server.cluster_drop_packet_filter == -2) { - serverLog(LL_WARNING, "Dropping packet that matches debug drop filter"); + serverLog(LL_WARNING, "Dropping packet of type %s that matches debug drop filter", + clusterGetMessageTypeString(type)); return 0; } @@ -3086,7 +3153,7 @@ int clusterProcessPacket(clusterLink *link) { if (server.debug_cluster_close_link_on_packet_drop && (type == server.cluster_drop_packet_filter || server.cluster_drop_packet_filter == -2)) { freeClusterLink(link); - serverLog(LL_WARNING, "Closing link for matching packet type %hu", type); + serverLog(LL_WARNING, "Closing link for matching packet type %s", clusterGetMessageTypeString(type)); return 0; } return 1; @@ -3102,8 +3169,8 @@ int clusterProcessPacket(clusterLink *link) { freeClusterLink(link); serverLog( LL_NOTICE, - "Closing link for node that sent a lightweight message of type %hu as its first message on the link", - type); + "Closing link for node that sent a lightweight message of type %s as its first message on the link", + clusterGetMessageTypeString(type)); return 0; } clusterNode *sender = link->node; @@ -3207,33 +3274,72 @@ int clusterProcessPacket(clusterLink *link) { } } - /* Add this node if it is new for us and the msg type is MEET. - * In this stage we don't try to add the node with the right - * flags, replicaof pointer, and so forth, as this details will be - * resolved when we'll receive PONGs from the node. The exception - * to this is the flag that indicates extensions are supported, as - * we want to send extensions right away in the return PONG in order - * to reduce the amount of time needed to stabilize the shard ID. 
*/ - if (!sender && type == CLUSTERMSG_TYPE_MEET) { - clusterNode *node; - - node = createClusterNode(NULL, CLUSTER_NODE_HANDSHAKE); - serverAssert(nodeIp2String(node->ip, link, hdr->myip) == C_OK); - getClientPortFromClusterMsg(hdr, &node->tls_port, &node->tcp_port); - node->cport = ntohs(hdr->cport); - if (hdr->mflags[0] & CLUSTERMSG_FLAG0_EXT_DATA) { - node->flags |= CLUSTER_NODE_EXTENSIONS_SUPPORTED; + if (type == CLUSTERMSG_TYPE_MEET) { + if (!sender) { + if (!link->node) { + char ip[NET_IP_STR_LEN] = {0}; + if (nodeIp2String(ip, link, hdr->myip) != C_OK) { + /* Unable to retrieve the node's IP address from the connection. Without a + * valid IP, the node becomes unusable in the cluster. This failure might be + * due to the connection being closed. */ + serverLog(LL_NOTICE, "Closing cluster link due to failure to retrieve IP from the connection, " + "possibly caused by a closed connection."); + freeClusterLink(link); + return 0; + } + + /* Add this node if it is new for us and the msg type is MEET. + * In this stage we don't try to add the node with the right + * flags, replicaof pointer, and so forth, as this details will be + * resolved when we'll receive PONGs from the node. The exception + * to this is the flag that indicates extensions are supported, as + * we want to send extensions right away in the return PONG in order + * to reduce the amount of time needed to stabilize the shard ID. */ + clusterNode *node = createClusterNode(NULL, CLUSTER_NODE_HANDSHAKE); + memcpy(node->ip, ip, sizeof(ip)); + getClientPortFromClusterMsg(hdr, &node->tls_port, &node->tcp_port); + node->cport = ntohs(hdr->cport); + if (hdr->mflags[0] & CLUSTERMSG_FLAG0_EXT_DATA) { + node->flags |= CLUSTER_NODE_EXTENSIONS_SUPPORTED; + } + setClusterNodeToInboundClusterLink(node, link); + clusterAddNode(node); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); + } else { + /* A second MEET packet was received on an existing link during the handshake + * process. 
This happens when the other node detects no inbound link, and + * re-sends a MEET packet before this node can respond with a PING. + * This MEET is a no-op. + * + * Note: Nodes in HANDSHAKE state are not fully "known" (random names), so the + * sender remains unidentified at this point. The MEET packet might be re-sent + * if the inbound connection is still unestablished by the next cron cycle. */ + debugServerAssert(link->inbound && nodeInHandshake(link->node)); + } + + /* If this is a MEET packet from an unknown node, we still process + * the gossip section here since we have to trust the sender because + * of the message type. */ + clusterProcessGossipSection(hdr, link); + } else if (sender->link && nodeExceedsHandshakeTimeout(sender, now)) { + /* The MEET packet is from a known node, after the handshake timeout, so the sender + * thinks that I do not know it. + * Free my outbound link to that node, triggering a reconnect and a PING over the + * new link. + * Once that node receives our PING, it should recognize the new connection as an + * inbound link from me. We should only free the outbound link if the node is known + * for more time than the handshake timeout, since during this time, the other side + * might still be trying to complete the handshake. */ + + /* We should always receive a MEET packet on an inbound link. */ + serverAssert(link != sender->link); + serverLog(LL_NOTICE, "Freeing outbound link to node %.40s (%s) after receiving a MEET packet " + "from this known node", + sender->name, sender->human_nodename); + freeClusterLink(sender->link); } - setClusterNodeToInboundClusterLink(node, link); - clusterAddNode(node); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); } - /* If this is a MEET packet from an unknown node, we still process - * the gossip section here since we have to trust the sender because - * of the message type. 
*/ - if (!sender && type == CLUSTERMSG_TYPE_MEET) clusterProcessGossipSection(hdr, link); - /* Anyway reply with a PONG */ clusterSendPing(link, CLUSTERMSG_TYPE_PONG); } @@ -3243,7 +3349,7 @@ int clusterProcessPacket(clusterLink *link) { serverLog(LL_DEBUG, "%s packet received: %.40s", clusterGetMessageTypeString(type), link->node ? link->node->name : "NULL"); - if (sender && (sender->flags & CLUSTER_NODE_MEET)) { + if (sender && nodeInMeetState(sender)) { /* Once we get a response for MEET from the sender, we can stop sending more MEET. */ sender->flags &= ~CLUSTER_NODE_MEET; serverLog(LL_NOTICE, "Successfully completed handshake with %.40s (%s)", sender->name, @@ -3277,7 +3383,9 @@ int clusterProcessPacket(clusterLink *link) { } else if (memcmp(link->node->name, hdr->sender, CLUSTER_NAMELEN) != 0) { /* If the reply has a non matching node ID we * disconnect this node and set it as not having an associated - * address. */ + * address. This can happen if the node did CLUSTER RESET and changed + * its node ID. In this case, the old node ID will not come back. */ + clusterNode *noaddr_node = link->node; serverLog(LL_NOTICE, "PONG contains mismatching sender ID. About node %.40s (%s) in shard %.40s added %d ms ago, " "having flags %d", @@ -3289,7 +3397,19 @@ int clusterProcessPacket(clusterLink *link) { link->node->tls_port = 0; link->node->cport = 0; freeClusterLink(link); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); + /* We will also mark the node as fail because we have disconnected from it, + * and will not reconnect, and obviously we will not gossip NOADDR nodes. + * Marking it as FAIL can help us advance the state, such as the cluster + * state becomes FAIL or the replica can do the failover. Otherwise, the + * NOADDR node will provide an invalid address in redirection and confuse + * the clients, and the replica will never initiate a failover since the + * node is not actually in FAIL state. 
*/ + if (!nodeFailed(noaddr_node)) { + noaddr_node->flags |= CLUSTER_NODE_FAIL; + noaddr_node->fail_time = now; + clusterSendFail(noaddr_node->name); + } + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE); return 0; } } @@ -3611,7 +3731,7 @@ void clusterWriteHandler(connection *conn) { while (totwritten < NET_MAX_WRITES_PER_EVENT && listLength(link->send_msg_queue) > 0) { listNode *head = listFirst(link->send_msg_queue); clusterMsgSendBlock *msgblock = (clusterMsgSendBlock *)head->value; - clusterMsg *msg = &msgblock->msg; + clusterMsg *msg = getMessageFromSendBlock(msgblock); size_t msg_offset = link->head_msg_send_offset; size_t msg_len = ntohl(msg->totlen); @@ -3668,7 +3788,7 @@ void clusterLinkConnectHandler(connection *conn) { * of a PING one, to force the receiver to add us in its node * table. */ mstime_t old_ping_sent = node->ping_sent; - clusterSendPing(link, node->flags & CLUSTER_NODE_MEET ? CLUSTERMSG_TYPE_MEET : CLUSTERMSG_TYPE_PING); + clusterSendPing(link, nodeInMeetState(node) ? CLUSTERMSG_TYPE_MEET : CLUSTERMSG_TYPE_PING); if (old_ping_sent) { /* If there was an active ping before the link was * disconnected, we want to restore the ping time, otherwise @@ -3747,7 +3867,9 @@ void clusterReadHandler(connection *conn) { if (nread <= 0) { /* I/O error... */ - serverLog(LL_DEBUG, "I/O error reading from node link: %s", + serverLog(LL_DEBUG, "I/O error reading from node link (%.40s:%s): %s", + link->node ? link->node->name : "", + link->inbound ? "inbound" : "outbound", (nread == 0) ? 
"connection closed" : connGetLastError(conn)); handleLinkIOError(link); return; @@ -3794,7 +3916,7 @@ void clusterSendMessage(clusterLink *link, clusterMsgSendBlock *msgblock) { if (!link) { return; } - if (listLength(link->send_msg_queue) == 0 && msgblock->msg.totlen != 0) + if (listLength(link->send_msg_queue) == 0 && getMessageFromSendBlock(msgblock)->totlen != 0) connSetWriteHandlerWithBarrier(link->conn, clusterWriteHandler, 1); listAddNodeTail(link->send_msg_queue, msgblock); @@ -3805,7 +3927,7 @@ void clusterSendMessage(clusterLink *link, clusterMsgSendBlock *msgblock) { server.stat_cluster_links_memory += sizeof(listNode); /* Populate sent messages stats. */ - uint16_t type = ntohs(msgblock->msg.type); + uint16_t type = ntohs(getMessageFromSendBlock(msgblock)->type) & ~CLUSTERMSG_MODIFIER_MASK; if (type < CLUSTERMSG_TYPE_COUNT) server.cluster->stats_bus_messages_sent[type]++; } @@ -3928,6 +4050,12 @@ void clusterSetGossipEntry(clusterMsg *hdr, int i, clusterNode *n) { /* Send a PING or PONG packet to the specified node, making sure to add enough * gossip information. */ void clusterSendPing(clusterLink *link, int type) { + serverLog(LL_DEBUG, "Sending %s packet to node %.40s (%s) on %s link", + clusterGetMessageTypeString(type), + link->node ? link->node->name : "", + link->node ? link->node->human_nodename : "", + link->inbound ? "inbound" : "outbound"); + static unsigned long long cluster_pings_sent = 0; cluster_pings_sent++; int gossipcount = 0; /* Number of gossip sections added so far. */ @@ -3985,9 +4113,14 @@ void clusterSendPing(clusterLink *link, int type) { * sizeof(clusterMsg) or more. 
*/ if (estlen < (int)sizeof(clusterMsg)) estlen = sizeof(clusterMsg); clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(type, estlen); - clusterMsg *hdr = &msgblock->msg; + clusterMsg *hdr = getMessageFromSendBlock(msgblock); - if (!link->inbound && type == CLUSTERMSG_TYPE_PING) link->node->ping_sent = mstime(); + if (!link->inbound) { + if (type == CLUSTERMSG_TYPE_PING) + link->node->ping_sent = mstime(); + else if (type == CLUSTERMSG_TYPE_MEET) + link->node->meet_sent = mstime(); + } /* Populate the gossip fields */ int maxiterations = wanted * 3; @@ -4130,10 +4263,10 @@ clusterMsgSendBlock *clusterCreatePublishMsgBlock(robj *channel, robj *message, clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(type, msglen); clusterMsgDataPublish *hdr_data_msg; if (is_light) { - clusterMsgLight *hdr_light = &msgblock->msg_light; + clusterMsgLight *hdr_light = getLightMessageFromSendBlock(msgblock); hdr_data_msg = &hdr_light->data.publish.msg; } else { - clusterMsg *hdr = &msgblock->msg; + clusterMsg *hdr = getMessageFromSendBlock(msgblock); hdr_data_msg = &hdr->data.publish.msg; } hdr_data_msg->channel_len = htonl(channel_len); @@ -4156,7 +4289,7 @@ void clusterSendFail(char *nodename) { uint32_t msglen = sizeof(clusterMsg) - sizeof(union clusterMsgData) + sizeof(clusterMsgDataFail); clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_FAIL, msglen); - clusterMsg *hdr = &msgblock->msg; + clusterMsg *hdr = getMessageFromSendBlock(msgblock); memcpy(hdr->data.fail.about.nodename, nodename, CLUSTER_NAMELEN); clusterBroadcastMessage(msgblock); @@ -4172,7 +4305,7 @@ void clusterSendUpdate(clusterLink *link, clusterNode *node) { uint32_t msglen = sizeof(clusterMsg) - sizeof(union clusterMsgData) + sizeof(clusterMsgDataUpdate); clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_UPDATE, msglen); - clusterMsg *hdr = &msgblock->msg; + clusterMsg *hdr = getMessageFromSendBlock(msgblock); 
memcpy(hdr->data.update.nodecfg.nodename, node->name, CLUSTER_NAMELEN); hdr->data.update.nodecfg.configEpoch = htonu64(node->configEpoch); memcpy(hdr->data.update.nodecfg.slots, node->slots, sizeof(node->slots)); @@ -4194,7 +4327,7 @@ void clusterSendModule(clusterLink *link, uint64_t module_id, uint8_t type, cons msglen += sizeof(clusterMsgModule) - 3 + len; clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_MODULE, msglen); - clusterMsg *hdr = &msgblock->msg; + clusterMsg *hdr = getMessageFromSendBlock(msgblock); hdr->data.module.msg.module_id = module_id; /* Already endian adjusted. */ hdr->data.module.msg.type = type; hdr->data.module.msg.len = htonl(len); @@ -4283,11 +4416,10 @@ void clusterRequestFailoverAuth(void) { uint32_t msglen = sizeof(clusterMsg) - sizeof(union clusterMsgData); clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST, msglen); - clusterMsg *hdr = &msgblock->msg; /* If this is a manual failover, set the CLUSTERMSG_FLAG0_FORCEACK bit * in the header to communicate the nodes receiving the message that * they should authorized the failover even if the primary is working. */ - if (server.cluster->mf_end) hdr->mflags[0] |= CLUSTERMSG_FLAG0_FORCEACK; + if (server.cluster->mf_end) msgblock->data[0].msg.mflags[0] |= CLUSTERMSG_FLAG0_FORCEACK; clusterBroadcastMessage(msgblock); clusterMsgSendBlockDecrRefCount(msgblock); } @@ -4364,23 +4496,6 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { return; } - /* We did not voted for a replica about this primary for two - * times the node timeout. This is not strictly needed for correctness - * of the algorithm but makes the base case more linear. - * - * This limitation does not restrict manual failover. If a user initiates - * a manual failover, we need to allow it to vote, otherwise the manual - * failover may time out. 
*/ - if (!force_ack && mstime() - node->replicaof->voted_time < server.cluster_node_timeout * 2) { - serverLog(LL_WARNING, - "Failover auth denied to %.40s (%s): " - "can't vote for any replica of %.40s (%s) within %lld milliseconds", - node->name, node->human_nodename, - node->replicaof->name, node->replicaof->human_nodename, - (long long)((server.cluster_node_timeout * 2) - (mstime() - node->replicaof->voted_time))); - return; - } - /* The replica requesting the vote must have a configEpoch for the claimed * slots that is >= the one of the primaries currently serving the same * slots in the current configuration. */ @@ -4394,7 +4509,7 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { * by the replica requesting our vote. Refuse to vote for this replica. */ serverLog(LL_WARNING, "Failover auth denied to %.40s (%s): " - "slot %d epoch (%llu) > reqEpoch (%llu)", + "slot %d epoch (%llu) > reqConfigEpoch (%llu)", node->name, node->human_nodename, j, (unsigned long long)server.cluster->slots[j]->configEpoch, (unsigned long long)requestConfigEpoch); return; @@ -4402,7 +4517,6 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { /* We can vote for this replica. */ server.cluster->lastVoteEpoch = server.cluster->currentEpoch; - if (!force_ack) node->replicaof->voted_time = mstime(); clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_FSYNC_CONFIG); clusterSendFailoverAuth(node); serverLog(LL_NOTICE, "Failover auth granted to %.40s (%s) for epoch %llu", node->name, node->human_nodename, @@ -4449,6 +4563,45 @@ int clusterGetReplicaRank(void) { return rank; } +/* This function returns the "rank" of this instance's primary, in the context + * of all failed primary list. The primary node will be ignored if failed time + * exceeds cluster-node-timeout * cluster-replica-validity-factor. 
+ * + * If multiple primary nodes go down at the same time, there is a certain + * probability that their replicas will initiate the elections at the same time, + * and lead to insufficient votes. + * + * The failed primary rank is used to add a delay to start an election in order + * to avoid simultaneous elections of replicas. */ +int clusterGetFailedPrimaryRank(void) { + serverAssert(nodeIsReplica(myself)); + serverAssert(myself->replicaof); + + int rank = 0; + mstime_t now = mstime(); + dictIterator *di; + dictEntry *de; + + di = dictGetSafeIterator(server.cluster->nodes); + while ((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + + /* Skip nodes that do not need to participate in the rank. */ + if (!nodeFailed(node) || !clusterNodeIsVotingPrimary(node) || node->num_replicas == 0) continue; + + /* If cluster-replica-validity-factor is enabled, skip the invalid nodes. */ + if (server.cluster_replica_validity_factor) { + if ((now - node->fail_time) > (server.cluster_node_timeout * server.cluster_replica_validity_factor)) + continue; + } + + if (memcmp(node->shard_id, myself->shard_id, CLUSTER_NAMELEN) < 0) rank++; + } + dictReleaseIterator(di); + + return rank; +} + /* This function is called by clusterHandleReplicaFailover() in order to * let the replica log why it is not able to failover. Sometimes there are * not the conditions, but since the failover function is called again and @@ -4630,6 +4783,11 @@ void clusterHandleReplicaFailover(void) { * Specifically 1 second * rank. This way replicas that have a probably * less updated replication offset, are penalized. */ server.cluster->failover_auth_time += server.cluster->failover_auth_rank * 1000; + /* We add another delay that is proportional to the failed primary rank. + * Specifically 0.5 second * rank. This way those failed primaries will be + * elected in rank to avoid the vote conflicts. 
*/ + server.cluster->failover_failed_primary_rank = clusterGetFailedPrimaryRank(); + server.cluster->failover_auth_time += server.cluster->failover_failed_primary_rank * 500; /* However if this is a manual failover, no delay is needed. */ if (server.cluster->mf_end) { server.cluster->failover_auth_time = now; @@ -4640,9 +4798,9 @@ void clusterHandleReplicaFailover(void) { } serverLog(LL_NOTICE, "Start of election delayed for %lld milliseconds " - "(rank #%d, offset %lld).", + "(rank #%d, primary rank #%d, offset %lld).", server.cluster->failover_auth_time - now, server.cluster->failover_auth_rank, - replicationGetReplicaOffset()); + server.cluster->failover_failed_primary_rank, replicationGetReplicaOffset()); /* Now that we have a scheduled election, broadcast our offset * to all the other replicas so that they'll updated their offsets * if our offset is better. */ @@ -4658,6 +4816,9 @@ void clusterHandleReplicaFailover(void) { * replicas for the same primary since we computed our election delay. * Update the delay if our rank changed. * + * It is also possible that we received the message that telling a + * shard is up. Update the delay if our failed_primary_rank changed. + * * Not performed if this is a manual failover. 
*/ if (server.cluster->failover_auth_sent == 0 && server.cluster->mf_end == 0) { int newrank = clusterGetReplicaRank(); @@ -4668,6 +4829,15 @@ void clusterHandleReplicaFailover(void) { serverLog(LL_NOTICE, "Replica rank updated to #%d, added %lld milliseconds of delay.", newrank, added_delay); } + + int new_failed_primary_rank = clusterGetFailedPrimaryRank(); + if (new_failed_primary_rank != server.cluster->failover_failed_primary_rank) { + long long added_delay = (new_failed_primary_rank - server.cluster->failover_failed_primary_rank) * 500; + server.cluster->failover_auth_time += added_delay; + server.cluster->failover_failed_primary_rank = new_failed_primary_rank; + serverLog(LL_NOTICE, "Failed primary rank updated to #%d, added %lld milliseconds of delay.", + new_failed_primary_rank, added_delay); + } } /* Return ASAP if we can't still start the election. */ @@ -4686,8 +4856,8 @@ void clusterHandleReplicaFailover(void) { if (server.cluster->failover_auth_sent == 0) { server.cluster->currentEpoch++; server.cluster->failover_auth_epoch = server.cluster->currentEpoch; - serverLog(LL_NOTICE, "Starting a failover election for epoch %llu.", - (unsigned long long)server.cluster->currentEpoch); + serverLog(LL_NOTICE, "Starting a failover election for epoch %llu, node config epoch is %llu", + (unsigned long long)server.cluster->currentEpoch, (unsigned long long)nodeEpoch(myself)); clusterRequestFailoverAuth(); server.cluster->failover_auth_sent = 1; clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_FSYNC_CONFIG); @@ -4925,10 +5095,22 @@ void clusterHandleManualFailover(void) { * CLUSTER cron job * -------------------------------------------------------------------------- */ +static mstime_t getHandshakeTimeout(void) { + /* The handshake timeout is the time after which a handshake node that was + * not turned into a normal node is removed from the nodes. 
Usually it is + * just the cluster_node_timeout value, but when cluster_node_timeout is + * too small we use the value of 1 second. */ + return max(server.cluster_node_timeout, 1000); +} + +static int nodeExceedsHandshakeTimeout(clusterNode *node, mstime_t now) { + return now - node->ctime > getHandshakeTimeout() ? 1 : 0; +} + /* Check if the node is disconnected and re-establish the connection. * Also update a few stats while we are here, that can be used to make * better decisions in other part of the code. */ -static int clusterNodeCronHandleReconnect(clusterNode *node, mstime_t handshake_timeout, mstime_t now) { +static int clusterNodeCronHandleReconnect(clusterNode *node, mstime_t now) { /* Not interested in reconnecting the link with myself or nodes * for which we have no address. */ if (node->flags & (CLUSTER_NODE_MYSELF | CLUSTER_NODE_NOADDR)) return 1; @@ -4937,12 +5119,24 @@ static int clusterNodeCronHandleReconnect(clusterNode *node, mstime_t handshake_ /* A Node in HANDSHAKE state has a limited lifespan equal to the * configured node timeout. */ - if (nodeInHandshake(node) && now - node->ctime > handshake_timeout) { - serverLog(LL_WARNING, "Clusterbus handshake timeout %s:%d after %lldms", node->ip, - node->cport, handshake_timeout); + if (nodeInHandshake(node) && nodeExceedsHandshakeTimeout(node, now)) { + serverLog(LL_WARNING, "Clusterbus handshake timeout %s:%d", node->ip, node->cport); clusterDelNode(node); return 1; } + if (nodeInNormalState(node) && node->link != NULL && node->inbound_link == NULL && + now - node->inbound_link_freed_time > getHandshakeTimeout() && + now - node->meet_sent > getHandshakeTimeout()) { + /* Node has an outbound link, but no inbound link for more than the handshake timeout. + * This probably means this node does not know us yet, whereas we know it. + * So we send it a MEET packet to do a handshake with it and correct the inconsistent cluster view. 
+ * We make sure to not re-send a MEET packet more than once every handshake timeout period, so as to + * leave the other node time to complete the handshake. */ + node->flags |= CLUSTER_NODE_MEET; + serverLog(LL_NOTICE, "Sending MEET packet to node %.40s (%s) because there is no inbound link for it", + node->name, node->human_nodename); + clusterSendPing(node->link, CLUSTERMSG_TYPE_MEET); + } if (node->link == NULL) { clusterLink *link = createClusterLink(node); @@ -5001,19 +5195,11 @@ void clusterCron(void) { mstime_t min_pong = 0, now = mstime(); clusterNode *min_pong_node = NULL; static unsigned long long iteration = 0; - mstime_t handshake_timeout; iteration++; /* Number of times this function was called so far. */ clusterUpdateMyselfHostname(); - /* The handshake timeout is the time after which a handshake node that was - * not turned into a normal node is removed from the nodes. Usually it is - * just the NODE_TIMEOUT value, but when NODE_TIMEOUT is too small we use - * the value of 1 second. */ - handshake_timeout = server.cluster_node_timeout; - if (handshake_timeout < 1000) handshake_timeout = 1000; - /* Clear so clusterNodeCronHandleReconnect can count the number of nodes in PFAIL. */ server.cluster->stats_pfail_nodes = 0; /* Run through some of the operations we want to do on each cluster node. */ @@ -5026,7 +5212,7 @@ void clusterCron(void) { /* The protocol is that function(s) below return non-zero if the node was * terminated. 
*/ - if (clusterNodeCronHandleReconnect(node, handshake_timeout, now)) continue; + if (clusterNodeCronHandleReconnect(node, now)) continue; } dictReleaseIterator(di); @@ -5140,7 +5326,7 @@ void clusterCron(void) { if (!(node->flags & (CLUSTER_NODE_PFAIL | CLUSTER_NODE_FAIL))) { node->flags |= CLUSTER_NODE_PFAIL; update_state = 1; - if (server.cluster->size == 1 && clusterNodeIsVotingPrimary(myself)) { + if (clusterNodeIsVotingPrimary(myself)) { markNodeAsFailingIfNeeded(node); } else { serverLog(LL_NOTICE, "NODE %.40s (%s) possibly failing.", node->name, node->human_nodename); @@ -6161,7 +6347,7 @@ unsigned int delKeysInSlot(unsigned int hashslot) { kvstoreHashtableIterator *kvs_di = NULL; void *next; - kvs_di = kvstoreGetHashtableSafeIterator(server.db->keys, hashslot); + kvs_di = kvstoreGetHashtableIterator(server.db->keys, hashslot, HASHTABLE_ITER_SAFE); while (kvstoreHashtableIteratorNext(kvs_di, &next)) { robj *valkey = next; enterExecutionUnit(1, 0); @@ -6497,7 +6683,7 @@ void clusterCommandSetSlot(client *c) { * replication, it would also unlikely win the election. * * And 0x702ff is 7.2.255, we only support new versions in this case. 
*/ - if (r->repl_state == REPLICA_STATE_ONLINE && r->replica_version > 0x702ff) { + if (r->repl_data->repl_state == REPLICA_STATE_ONLINE && r->repl_data->replica_version > 0x702ff) { num_eligible_replicas++; } } diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index 5595402a4d..226842c5dc 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -61,12 +61,14 @@ typedef struct clusterLink { #define nodeIsPrimary(n) ((n)->flags & CLUSTER_NODE_PRIMARY) #define nodeIsReplica(n) ((n)->flags & CLUSTER_NODE_REPLICA) #define nodeInHandshake(n) ((n)->flags & CLUSTER_NODE_HANDSHAKE) +#define nodeInMeetState(n) ((n)->flags & CLUSTER_NODE_MEET) #define nodeHasAddr(n) (!((n)->flags & CLUSTER_NODE_NOADDR)) #define nodeTimedOut(n) ((n)->flags & CLUSTER_NODE_PFAIL) #define nodeFailed(n) ((n)->flags & CLUSTER_NODE_FAIL) #define nodeCantFailover(n) ((n)->flags & CLUSTER_NODE_NOFAILOVER) #define nodeSupportsExtensions(n) ((n)->flags & CLUSTER_NODE_EXTENSIONS_SUPPORTED) #define nodeSupportsLightMsgHdr(n) ((n)->flags & CLUSTER_NODE_LIGHT_HDR_SUPPORTED) +#define nodeInNormalState(n) (!((n)->flags & (CLUSTER_NODE_HANDSHAKE | CLUSTER_NODE_MEET | CLUSTER_NODE_PFAIL | CLUSTER_NODE_FAIL))) /* This structure represent elements of node->fail_reports. */ typedef struct clusterNodeFailReport { @@ -338,11 +340,12 @@ struct _clusterNode { mstime_t ping_sent; /* Unix time we sent latest ping */ mstime_t pong_received; /* Unix time we received the pong */ mstime_t data_received; /* Unix time we received any data */ + mstime_t meet_sent; /* Unix time we sent latest meet packet */ mstime_t fail_time; /* Unix time when FAIL flag was set */ - mstime_t voted_time; /* Last time we voted for a replica of this primary in non manual - * failover scenarios. 
*/ mstime_t repl_offset_time; /* Unix time we received offset for this node */ mstime_t orphaned_time; /* Starting time of orphaned primary condition */ + mstime_t inbound_link_freed_time; /* Last time we freed the inbound link for this node. + If it was never freed, it is the same as ctime */ long long repl_offset; /* Last known repl offset for this node. */ char ip[NET_IP_STR_LEN]; /* Latest known IP address of this node */ sds announce_client_ipv4; /* IPv4 for clients only. */ @@ -379,13 +382,14 @@ struct clusterState { clusterNode *importing_slots_from[CLUSTER_SLOTS]; clusterNode *slots[CLUSTER_SLOTS]; /* The following fields are used to take the replica state on elections. */ - mstime_t failover_auth_time; /* Time of previous or next election. */ - int failover_auth_count; /* Number of votes received so far. */ - int failover_auth_sent; /* True if we already asked for votes. */ - int failover_auth_rank; /* This replica rank for current auth request. */ - uint64_t failover_auth_epoch; /* Epoch of the current election. */ - int cant_failover_reason; /* Why a replica is currently not able to - failover. See the CANT_FAILOVER_* macros. */ + mstime_t failover_auth_time; /* Time of previous or next election. */ + int failover_auth_count; /* Number of votes received so far. */ + int failover_auth_sent; /* True if we already asked for votes. */ + int failover_auth_rank; /* This replica rank for current auth request. */ + int failover_failed_primary_rank; /* The rank of this instance in the context of all failed primary list. */ + uint64_t failover_auth_epoch; /* Epoch of the current election. */ + int cant_failover_reason; /* Why a replica is currently not able to + * failover. See the CANT_FAILOVER_* macros. */ /* Manual failover state in common. */ mstime_t mf_end; /* Manual failover time limit (ms unixtime). It is zero if there is no MF in progress. 
*/ diff --git a/src/commandlog.c b/src/commandlog.c new file mode 100644 index 0000000000..cf25cbf2c2 --- /dev/null +++ b/src/commandlog.c @@ -0,0 +1,265 @@ +/* Commandlog implements a system that is able to remember the latest N + * queries that took more than M microseconds to execute, or consumed + * too much network bandwidth and memory for input/output buffers. + * + * The execution time to reach to be logged in the slow log is set + * using the 'commandlog-execution-slower-than' config directive, that is also + * readable and writable using the CONFIG SET/GET command. + * + * Other configurations such as `commandlog-request-larger-than` and + * `commandlog-reply-larger-than` can be found with more detailed + * explanations in the config file. + * + * The command log is actually not "logged" in the server log file + * but is accessible thanks to the COMMANDLOG command. + * + * ---------------------------------------------------------------------------- + * + * Copyright Valkey Contributors. + * All rights reserved. + * SPDX-License-Identifier: BSD 3-Clause + */ + +#include "commandlog.h" +#include "script.h" + +/* Create a new commandlog entry. + * Incrementing the ref count of all the objects retained is up to + * this function. */ +static commandlogEntry *commandlogCreateEntry(client *c, robj **argv, int argc, long long value, int type) { + commandlogEntry *ce = zmalloc(sizeof(*ce)); + int j, ceargc = argc; + + if (ceargc > COMMANDLOG_ENTRY_MAX_ARGC) ceargc = COMMANDLOG_ENTRY_MAX_ARGC; + ce->argc = ceargc; + ce->argv = zmalloc(sizeof(robj *) * ceargc); + for (j = 0; j < ceargc; j++) { + /* Logging too many arguments is a useless memory waste, so we stop + * at COMMANDLOG_ENTRY_MAX_ARGC, but use the last argument to specify + * how many remaining arguments there were in the original command. */ + if (ceargc != argc && j == ceargc - 1) { + ce->argv[j] = + createObject(OBJ_STRING, sdscatprintf(sdsempty(), "... 
(%d more arguments)", argc - ceargc + 1)); + } else { + /* Trim too long strings as well... */ + if (argv[j]->type == OBJ_STRING && sdsEncodedObject(argv[j]) && + sdslen(argv[j]->ptr) > COMMANDLOG_ENTRY_MAX_STRING) { + sds s = sdsnewlen(argv[j]->ptr, COMMANDLOG_ENTRY_MAX_STRING); + + s = sdscatprintf(s, "... (%lu more bytes)", + (unsigned long)sdslen(argv[j]->ptr) - COMMANDLOG_ENTRY_MAX_STRING); + ce->argv[j] = createObject(OBJ_STRING, s); + } else if (argv[j]->refcount == OBJ_SHARED_REFCOUNT) { + ce->argv[j] = argv[j]; + } else { + /* Here we need to duplicate the string objects composing the + * argument vector of the command, because those may otherwise + * end shared with string objects stored into keys. Having + * shared objects between any part of the server, and the data + * structure holding the data, is a problem: FLUSHALL ASYNC + * may release the shared string object and create a race. */ + ce->argv[j] = dupStringObject(argv[j]); + } + } + } + ce->time = time(NULL); + ce->value = value; + ce->id = server.commandlog[type].entry_id++; + ce->peerid = sdsnew(getClientPeerId(c)); + ce->cname = c->name ? sdsnew(c->name->ptr) : sdsempty(); + return ce; +} + +/* Free a command log entry. The argument is void so that the prototype of this + * function matches the one of the 'free' method of adlist.c. + * + * This function will take care to release all the retained object. */ +static void commandlogFreeEntry(void *ceptr) { + commandlogEntry *ce = ceptr; + int j; + + for (j = 0; j < ce->argc; j++) decrRefCount(ce->argv[j]); + zfree(ce->argv); + sdsfree(ce->peerid); + sdsfree(ce->cname); + zfree(ce); +} + +/* Initialize the command log. This function should be called a single time + * at server startup. 
*/ +void commandlogInit(void) { + for (int i = 0; i < COMMANDLOG_TYPE_NUM; i++) { + server.commandlog[i].entries = listCreate(); + server.commandlog[i].entry_id = 0; + listSetFreeMethod(server.commandlog[i].entries, commandlogFreeEntry); + } +} + +/* Push a new entry into the command log. + * This function will make sure to trim the command log accordingly to the + * configured max length. */ +static void commandlogPushEntryIfNeeded(client *c, robj **argv, int argc, long long value, int type) { + if (server.commandlog[type].threshold < 0 || server.commandlog[type].max_len == 0) return; /* The corresponding commandlog disabled */ + if (value >= server.commandlog[type].threshold) + listAddNodeHead(server.commandlog[type].entries, commandlogCreateEntry(c, argv, argc, value, type)); + + /* Remove old entries if needed. */ + while (listLength(server.commandlog[type].entries) > server.commandlog[type].max_len) listDelNode(server.commandlog[type].entries, listLast(server.commandlog[type].entries)); +} + +/* Remove all the entries from the current command log of the specified type. */ +static void commandlogReset(int type) { + while (listLength(server.commandlog[type].entries) > 0) listDelNode(server.commandlog[type].entries, listLast(server.commandlog[type].entries)); +} + +/* Reply command logs to client. 
*/ +static void commandlogGetReply(client *c, int type, long count) { + listIter li; + listNode *ln; + commandlogEntry *ce; + + if (count > (long)listLength(server.commandlog[type].entries)) { + count = listLength(server.commandlog[type].entries); + } + addReplyArrayLen(c, count); + listRewind(server.commandlog[type].entries, &li); + while (count--) { + int j; + + ln = listNext(&li); + ce = ln->value; + addReplyArrayLen(c, 6); + addReplyLongLong(c, ce->id); + addReplyLongLong(c, ce->time); + addReplyLongLong(c, ce->value); + addReplyArrayLen(c, ce->argc); + for (j = 0; j < ce->argc; j++) addReplyBulk(c, ce->argv[j]); + addReplyBulkCBuffer(c, ce->peerid, sdslen(ce->peerid)); + addReplyBulkCBuffer(c, ce->cname, sdslen(ce->cname)); + } +} + +/* Log the last command a client executed into the commandlog. */ +void commandlogPushCurrentCommand(client *c, struct serverCommand *cmd) { + /* Some commands may contain sensitive data that should not be available in the commandlog. + */ + if (cmd->flags & CMD_SKIP_COMMANDLOG) return; + + /* If command argument vector was rewritten, use the original + * arguments. */ + robj **argv = c->original_argv ? c->original_argv : c->argv; + int argc = c->original_argv ? c->original_argc : c->argc; + + /* If a script is currently running, the client passed in is a + * fake client. Or the client passed in is the original client + * if this is a EVAL or alike, doesn't matter. In this case, + * use the original client to get the client information. */ + c = scriptIsRunning() ? scriptGetCaller() : c; + + commandlogPushEntryIfNeeded(c, argv, argc, c->duration, COMMANDLOG_TYPE_SLOW); + commandlogPushEntryIfNeeded(c, argv, argc, c->net_input_bytes_curr_cmd, COMMANDLOG_TYPE_LARGE_REQUEST); + commandlogPushEntryIfNeeded(c, argv, argc, c->net_output_bytes_curr_cmd, COMMANDLOG_TYPE_LARGE_REPLY); +} + +/* The SLOWLOG command. Implements all the subcommands needed to handle the + * slow log. 
*/ +void slowlogCommand(client *c) { + if (c->argc == 2 && !strcasecmp(c->argv[1]->ptr, "help")) { + const char *help[] = { + "GET []", + " Return top entries from the slowlog (default: 10, -1 mean all).", + " Entries are made of:", + " id, timestamp, time in microseconds, arguments array, client IP and port,", + " client name", + "LEN", + " Return the length of the slowlog.", + "RESET", + " Reset the slowlog.", + NULL, + }; + addReplyHelp(c, help); + } else if (c->argc == 2 && !strcasecmp(c->argv[1]->ptr, "reset")) { + commandlogReset(COMMANDLOG_TYPE_SLOW); + addReply(c, shared.ok); + } else if (c->argc == 2 && !strcasecmp(c->argv[1]->ptr, "len")) { + addReplyLongLong(c, listLength(server.commandlog[COMMANDLOG_TYPE_SLOW].entries)); + } else if ((c->argc == 2 || c->argc == 3) && !strcasecmp(c->argv[1]->ptr, "get")) { + long count = 10; + + if (c->argc == 3) { + /* Consume count arg. */ + if (getRangeLongFromObjectOrReply(c, c->argv[2], -1, LONG_MAX, &count, + "count should be greater than or equal to -1") != C_OK) + return; + + if (count == -1) { + /* We treat -1 as a special value, which means to get all slow logs. + * Simply set count to the length of server.commandlog. */ + count = listLength(server.commandlog[COMMANDLOG_TYPE_SLOW].entries); + } + } + + commandlogGetReply(c, COMMANDLOG_TYPE_SLOW, count); + } else { + addReplySubcommandSyntaxError(c); + } +} + +static int commandlogGetTypeOrReply(client *c, robj *o) { + if (!strcasecmp(o->ptr, "slow")) return COMMANDLOG_TYPE_SLOW; + if (!strcasecmp(o->ptr, "large-request")) return COMMANDLOG_TYPE_LARGE_REQUEST; + if (!strcasecmp(o->ptr, "large-reply")) return COMMANDLOG_TYPE_LARGE_REPLY; + addReplyError(c, "type should be one of the following: slow, large-request, large-reply"); + return -1; +} + +/* The COMMANDLOG command. Implements all the subcommands needed to handle the + * command log. 
*/ +void commandlogCommand(client *c) { + int type; + if (c->argc == 2 && !strcasecmp(c->argv[1]->ptr, "help")) { + const char *help[] = { + "GET ", + " Return top entries of the specified from the commandlog (-1 mean all).", + " Entries are made of:", + " id, timestamp,", + " time in microseconds for type of slow,", + " or size in bytes for type of large-request,", + " or size in bytes for type of large-reply", + " arguments array, client IP and port,", + " client name", + "LEN ", + " Return the length of the specified type of commandlog.", + "RESET ", + " Reset the specified type of commandlog.", + NULL, + }; + addReplyHelp(c, help); + } else if (c->argc == 3 && !strcasecmp(c->argv[1]->ptr, "reset")) { + if ((type = commandlogGetTypeOrReply(c, c->argv[2])) == -1) return; + commandlogReset(type); + addReply(c, shared.ok); + } else if (c->argc == 3 && !strcasecmp(c->argv[1]->ptr, "len")) { + if ((type = commandlogGetTypeOrReply(c, c->argv[2])) == -1) return; + addReplyLongLong(c, listLength(server.commandlog[type].entries)); + } else if (c->argc == 4 && !strcasecmp(c->argv[1]->ptr, "get")) { + long count; + + /* Consume count arg. */ + if (getRangeLongFromObjectOrReply(c, c->argv[2], -1, LONG_MAX, &count, + "count should be greater than or equal to -1") != C_OK) + return; + + if ((type = commandlogGetTypeOrReply(c, c->argv[3])) == -1) return; + + if (count == -1) { + /* We treat -1 as a special value, which means to get all command logs. + * Simply set count to the length of server.commandlog. */ + count = listLength(server.commandlog[type].entries); + } + + commandlogGetReply(c, type, count); + } else { + addReplySubcommandSyntaxError(c); + } +} diff --git a/src/slowlog.h b/src/commandlog.h similarity index 71% rename from src/slowlog.h rename to src/commandlog.h index 12d9097ffa..825014746e 100644 --- a/src/slowlog.h +++ b/src/commandlog.h @@ -27,27 +27,26 @@ * POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __SLOWLOG_H__ -#define __SLOWLOG_H__ +#ifndef __COMMANDLOG_H__ +#define __COMMANDLOG_H__ #include "server.h" -#define SLOWLOG_ENTRY_MAX_ARGC 32 -#define SLOWLOG_ENTRY_MAX_STRING 128 +#define COMMANDLOG_ENTRY_MAX_ARGC 32 +#define COMMANDLOG_ENTRY_MAX_STRING 128 -/* This structure defines an entry inside the slow log list */ -typedef struct slowlogEntry { +/* This structure defines an entry inside the command log list */ +typedef struct commandlogEntry { robj **argv; int argc; - long long id; /* Unique entry identifier. */ - long long duration; /* Time spent by the query, in microseconds. */ - time_t time; /* Unix time at which the query was executed. */ - sds cname; /* Client name. */ - sds peerid; /* Client network address. */ -} slowlogEntry; + long long id; /* Unique entry identifier. */ + long long value; /* The meaning is determined by the type of command log. */ + time_t time; /* Unix time at which the query was executed. */ + sds cname; /* Client name. */ + sds peerid; /* Client network address. */ +} commandlogEntry; /* Exported API */ -void slowlogInit(void); -void slowlogPushEntryIfNeeded(client *c, robj **argv, int argc, long long duration); +void commandlogInit(void); -#endif /* __SLOWLOG_H__ */ +#endif /* __COMMANDLOG_H__ */ diff --git a/src/commands.def b/src/commands.def index f03e44db9f..613eb16c9b 100644 --- a/src/commands.def +++ b/src/commands.def @@ -1289,6 +1289,7 @@ commandHistory CLIENT_KILL_History[] = { {"6.2.0","`LADDR` option."}, {"8.0.0","`MAXAGE` option."}, {"8.0.0","Replaced `master` `TYPE` with `primary`. 
`master` still supported for backward compatibility."}, +{"8.1.0","`ID` option accepts multiple IDs."}, }; #endif @@ -1320,7 +1321,7 @@ struct COMMAND_ARG CLIENT_KILL_filter_new_format_skipme_Subargs[] = { /* CLIENT KILL filter new_format argument table */ struct COMMAND_ARG CLIENT_KILL_filter_new_format_Subargs[] = { -{MAKE_ARG("client-id",ARG_TYPE_INTEGER,-1,"ID",NULL,"2.8.12",CMD_ARG_OPTIONAL,0,NULL)}, +{MAKE_ARG("client-id",ARG_TYPE_INTEGER,-1,"ID",NULL,"2.8.12",CMD_ARG_OPTIONAL|CMD_ARG_MULTIPLE,0,NULL)}, {MAKE_ARG("client-type",ARG_TYPE_ONEOF,-1,"TYPE",NULL,"2.8.12",CMD_ARG_OPTIONAL,6,NULL),.subargs=CLIENT_KILL_filter_new_format_client_type_Subargs}, {MAKE_ARG("username",ARG_TYPE_STRING,-1,"USER",NULL,NULL,CMD_ARG_OPTIONAL,0,NULL)}, {MAKE_ARG("addr",ARG_TYPE_STRING,-1,"ADDR",NULL,NULL,CMD_ARG_OPTIONAL,0,NULL),.display_text="ip:port"}, @@ -1352,6 +1353,7 @@ commandHistory CLIENT_LIST_History[] = { {"7.0.0","Added `resp`, `multi-mem`, `rbs` and `rbp` fields."}, {"7.0.3","Added `ssub` field."}, {"8.0.0","Replaced `master` `TYPE` with `primary`. 
`master` still supported for backward compatibility."}, +{"8.1.0","Added filters USER, ADDR, LADDR, SKIPME, and MAXAGE"}, }; #endif @@ -1375,10 +1377,21 @@ struct COMMAND_ARG CLIENT_LIST_client_type_Subargs[] = { {MAKE_ARG("pubsub",ARG_TYPE_PURE_TOKEN,-1,"PUBSUB",NULL,NULL,CMD_ARG_NONE,0,NULL)}, }; +/* CLIENT LIST skipme argument table */ +struct COMMAND_ARG CLIENT_LIST_skipme_Subargs[] = { +{MAKE_ARG("yes",ARG_TYPE_PURE_TOKEN,-1,"YES",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("no",ARG_TYPE_PURE_TOKEN,-1,"NO",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +}; + /* CLIENT LIST argument table */ struct COMMAND_ARG CLIENT_LIST_Args[] = { {MAKE_ARG("client-type",ARG_TYPE_ONEOF,-1,"TYPE",NULL,"5.0.0",CMD_ARG_OPTIONAL,4,NULL),.subargs=CLIENT_LIST_client_type_Subargs}, {MAKE_ARG("client-id",ARG_TYPE_INTEGER,-1,"ID",NULL,"6.2.0",CMD_ARG_OPTIONAL|CMD_ARG_MULTIPLE,0,NULL)}, +{MAKE_ARG("username",ARG_TYPE_STRING,-1,"USER",NULL,"8.1.0",CMD_ARG_OPTIONAL,0,NULL)}, +{MAKE_ARG("addr",ARG_TYPE_STRING,-1,"ADDR",NULL,"8.1.0",CMD_ARG_OPTIONAL,0,NULL),.display_text="ip:port"}, +{MAKE_ARG("laddr",ARG_TYPE_STRING,-1,"LADDR",NULL,"8.1.0",CMD_ARG_OPTIONAL,0,NULL),.display_text="ip:port"}, +{MAKE_ARG("skipme",ARG_TYPE_ONEOF,-1,"SKIPME",NULL,"8.1.0",CMD_ARG_OPTIONAL,2,NULL),.subargs=CLIENT_LIST_skipme_Subargs}, +{MAKE_ARG("maxage",ARG_TYPE_INTEGER,-1,"MAXAGE",NULL,"8.1.0",CMD_ARG_OPTIONAL,0,NULL)}, }; /********** CLIENT NO_EVICT ********************/ @@ -1652,26 +1665,26 @@ struct COMMAND_ARG CLIENT_UNBLOCK_Args[] = { /* CLIENT command table */ struct COMMAND_STRUCT CLIENT_Subcommands[] = { -{MAKE_CMD("caching","Instructs the server whether to track the keys in the next request.","O(1)","6.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_CACHING_History,0,CLIENT_CACHING_Tips,0,clientCommand,3,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_CACHING_Keyspecs,0,NULL,1),.args=CLIENT_CACHING_Args}, -{MAKE_CMD("capa","A client claims its 
capability.","O(1)","8.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_CAPA_History,0,CLIENT_CAPA_Tips,0,clientCommand,-3,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE,ACL_CATEGORY_CONNECTION,CLIENT_CAPA_Keyspecs,0,NULL,1),.args=CLIENT_CAPA_Args}, -{MAKE_CMD("getname","Returns the name of the connection.","O(1)","2.6.9",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_GETNAME_History,0,CLIENT_GETNAME_Tips,0,clientCommand,2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_GETNAME_Keyspecs,0,NULL,0)}, -{MAKE_CMD("getredir","Returns the client ID to which the connection's tracking notifications are redirected.","O(1)","6.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_GETREDIR_History,0,CLIENT_GETREDIR_Tips,0,clientCommand,2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_GETREDIR_Keyspecs,0,NULL,0)}, -{MAKE_CMD("help","Returns helpful text about the different subcommands.","O(1)","5.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_HELP_History,0,CLIENT_HELP_Tips,0,clientCommand,2,CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_HELP_Keyspecs,0,NULL,0)}, -{MAKE_CMD("id","Returns the unique client ID of the connection.","O(1)","5.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_ID_History,0,CLIENT_ID_Tips,0,clientCommand,2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_ID_Keyspecs,0,NULL,0)}, -{MAKE_CMD("import-source","Mark this client as an import source when server is in import mode.","O(1)","8.1.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_IMPORT_SOURCE_History,0,CLIENT_IMPORT_SOURCE_Tips,0,clientCommand,3,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE,ACL_CATEGORY_CONNECTION,CLIENT_IMPORT_SOURCE_Keyspecs,0,NULL,1),.args=CLIENT_IMPORT_SOURCE_Args}, -{MAKE_CMD("info","Returns information about the 
connection.","O(1)","6.2.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_INFO_History,0,CLIENT_INFO_Tips,1,clientCommand,2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_INFO_Keyspecs,0,NULL,0)}, -{MAKE_CMD("kill","Terminates open connections.","O(N) where N is the number of client connections","2.4.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_KILL_History,7,CLIENT_KILL_Tips,0,clientCommand,-3,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_KILL_Keyspecs,0,NULL,1),.args=CLIENT_KILL_Args}, -{MAKE_CMD("list","Lists open connections.","O(N) where N is the number of client connections","2.4.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_LIST_History,7,CLIENT_LIST_Tips,1,clientCommand,-2,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_LIST_Keyspecs,0,NULL,2),.args=CLIENT_LIST_Args}, -{MAKE_CMD("no-evict","Sets the client eviction mode of the connection.","O(1)","7.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_NO_EVICT_History,0,CLIENT_NO_EVICT_Tips,0,clientCommand,3,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_NO_EVICT_Keyspecs,0,NULL,1),.args=CLIENT_NO_EVICT_Args}, -{MAKE_CMD("no-touch","Controls whether commands sent by the client affect the LRU/LFU of accessed keys.","O(1)","7.2.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_NO_TOUCH_History,0,CLIENT_NO_TOUCH_Tips,0,clientCommand,3,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE,ACL_CATEGORY_CONNECTION,CLIENT_NO_TOUCH_Keyspecs,0,NULL,1),.args=CLIENT_NO_TOUCH_Args}, -{MAKE_CMD("pause","Suspends commands 
processing.","O(1)","3.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_PAUSE_History,1,CLIENT_PAUSE_Tips,0,clientCommand,-3,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_PAUSE_Keyspecs,0,NULL,2),.args=CLIENT_PAUSE_Args}, -{MAKE_CMD("reply","Instructs the server whether to reply to commands.","O(1)","3.2.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_REPLY_History,0,CLIENT_REPLY_Tips,0,clientCommand,3,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_REPLY_Keyspecs,0,NULL,1),.args=CLIENT_REPLY_Args}, +{MAKE_CMD("caching","Instructs the server whether to track the keys in the next request.","O(1)","6.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_CACHING_History,0,CLIENT_CACHING_Tips,0,clientCachingCommand,3,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_CACHING_Keyspecs,0,NULL,1),.args=CLIENT_CACHING_Args}, +{MAKE_CMD("capa","A client claims its capability.","O(1)","8.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_CAPA_History,0,CLIENT_CAPA_Tips,0,clientCapaCommand,-3,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE,ACL_CATEGORY_CONNECTION,CLIENT_CAPA_Keyspecs,0,NULL,1),.args=CLIENT_CAPA_Args}, +{MAKE_CMD("getname","Returns the name of the connection.","O(1)","2.6.9",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_GETNAME_History,0,CLIENT_GETNAME_Tips,0,clientGetNameCommand,2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_GETNAME_Keyspecs,0,NULL,0)}, +{MAKE_CMD("getredir","Returns the client ID to which the connection's tracking notifications are redirected.","O(1)","6.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_GETREDIR_History,0,CLIENT_GETREDIR_Tips,0,clientGetredirCommand,2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_GETREDIR_Keyspecs,0,NULL,0)}, 
+{MAKE_CMD("help","Returns helpful text about the different subcommands.","O(1)","5.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_HELP_History,0,CLIENT_HELP_Tips,0,clientHelpCommand,2,CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_HELP_Keyspecs,0,NULL,0)}, +{MAKE_CMD("id","Returns the unique client ID of the connection.","O(1)","5.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_ID_History,0,CLIENT_ID_Tips,0,clientIDCommand,2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_ID_Keyspecs,0,NULL,0)}, +{MAKE_CMD("import-source","Mark this client as an import source when server is in import mode.","O(1)","8.1.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_IMPORT_SOURCE_History,0,CLIENT_IMPORT_SOURCE_Tips,0,clientImportSourceCommand,3,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE,ACL_CATEGORY_CONNECTION,CLIENT_IMPORT_SOURCE_Keyspecs,0,NULL,1),.args=CLIENT_IMPORT_SOURCE_Args}, +{MAKE_CMD("info","Returns information about the connection.","O(1)","6.2.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_INFO_History,0,CLIENT_INFO_Tips,1,clientInfoCommand,2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_INFO_Keyspecs,0,NULL,0)}, +{MAKE_CMD("kill","Terminates open connections.","O(N) where N is the number of client connections","2.4.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_KILL_History,8,CLIENT_KILL_Tips,0,clientKillCommand,-3,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_KILL_Keyspecs,0,NULL,1),.args=CLIENT_KILL_Args}, +{MAKE_CMD("list","Lists open connections.","O(N) where N is the number of client 
connections","2.4.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_LIST_History,8,CLIENT_LIST_Tips,1,clientListCommand,-2,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_LIST_Keyspecs,0,NULL,7),.args=CLIENT_LIST_Args}, +{MAKE_CMD("no-evict","Sets the client eviction mode of the connection.","O(1)","7.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_NO_EVICT_History,0,CLIENT_NO_EVICT_Tips,0,clientNoEvictCommand,3,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_NO_EVICT_Keyspecs,0,NULL,1),.args=CLIENT_NO_EVICT_Args}, +{MAKE_CMD("no-touch","Controls whether commands sent by the client affect the LRU/LFU of accessed keys.","O(1)","7.2.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_NO_TOUCH_History,0,CLIENT_NO_TOUCH_Tips,0,clientNoTouchCommand,3,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE,ACL_CATEGORY_CONNECTION,CLIENT_NO_TOUCH_Keyspecs,0,NULL,1),.args=CLIENT_NO_TOUCH_Args}, +{MAKE_CMD("pause","Suspends commands processing.","O(1)","3.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_PAUSE_History,1,CLIENT_PAUSE_Tips,0,clientPauseCommand,-3,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_PAUSE_Keyspecs,0,NULL,2),.args=CLIENT_PAUSE_Args}, +{MAKE_CMD("reply","Instructs the server whether to reply to commands.","O(1)","3.2.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_REPLY_History,0,CLIENT_REPLY_Tips,0,clientReplyCommand,3,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_REPLY_Keyspecs,0,NULL,1),.args=CLIENT_REPLY_Args}, {MAKE_CMD("setinfo","Sets information specific to the client or 
connection.","O(1)","7.2.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_SETINFO_History,0,CLIENT_SETINFO_Tips,2,clientSetinfoCommand,4,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_SETINFO_Keyspecs,0,NULL,1),.args=CLIENT_SETINFO_Args}, -{MAKE_CMD("setname","Sets the connection name.","O(1)","2.6.9",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_SETNAME_History,0,CLIENT_SETNAME_Tips,2,clientCommand,3,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_SETNAME_Keyspecs,0,NULL,1),.args=CLIENT_SETNAME_Args}, -{MAKE_CMD("tracking","Controls server-assisted client-side caching for the connection.","O(1). Some options may introduce additional complexity.","6.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_TRACKING_History,0,CLIENT_TRACKING_Tips,0,clientCommand,-3,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_TRACKING_Keyspecs,0,NULL,7),.args=CLIENT_TRACKING_Args}, -{MAKE_CMD("trackinginfo","Returns information about server-assisted client-side caching for the connection.","O(1)","6.2.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_TRACKINGINFO_History,0,CLIENT_TRACKINGINFO_Tips,0,clientCommand,2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_TRACKINGINFO_Keyspecs,0,NULL,0)}, -{MAKE_CMD("unblock","Unblocks a client blocked by a blocking command from a different connection.","O(log N) where N is the number of client connections","5.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_UNBLOCK_History,0,CLIENT_UNBLOCK_Tips,0,clientCommand,-3,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_UNBLOCK_Keyspecs,0,NULL,2),.args=CLIENT_UNBLOCK_Args}, -{MAKE_CMD("unpause","Resumes processing commands from paused clients.","O(N) Where N is the number of paused 
clients","6.2.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_UNPAUSE_History,0,CLIENT_UNPAUSE_Tips,0,clientCommand,2,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_UNPAUSE_Keyspecs,0,NULL,0)}, +{MAKE_CMD("setname","Sets the connection name.","O(1)","2.6.9",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_SETNAME_History,0,CLIENT_SETNAME_Tips,2,clientSetNameCommand,3,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_SETNAME_Keyspecs,0,NULL,1),.args=CLIENT_SETNAME_Args}, +{MAKE_CMD("tracking","Controls server-assisted client-side caching for the connection.","O(1). Some options may introduce additional complexity.","6.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_TRACKING_History,0,CLIENT_TRACKING_Tips,0,clientTrackingCommand,-3,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_TRACKING_Keyspecs,0,NULL,7),.args=CLIENT_TRACKING_Args}, +{MAKE_CMD("trackinginfo","Returns information about server-assisted client-side caching for the connection.","O(1)","6.2.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_TRACKINGINFO_History,0,CLIENT_TRACKINGINFO_Tips,0,clientTrackingInfoCommand,2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_TRACKINGINFO_Keyspecs,0,NULL,0)}, +{MAKE_CMD("unblock","Unblocks a client blocked by a blocking command from a different connection.","O(log N) where N is the number of client connections","5.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_UNBLOCK_History,0,CLIENT_UNBLOCK_Tips,0,clientUnblockCommand,-3,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_UNBLOCK_Keyspecs,0,NULL,2),.args=CLIENT_UNBLOCK_Args}, +{MAKE_CMD("unpause","Resumes processing commands from paused clients.","O(N) Where N is the number of paused 
clients","6.2.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_UNPAUSE_History,0,CLIENT_UNPAUSE_Tips,0,clientUnpauseCommand,2,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_UNPAUSE_Keyspecs,0,NULL,0)}, {0} }; @@ -1719,6 +1732,7 @@ struct COMMAND_ARG ECHO_Args[] = { #ifndef SKIP_CMD_HISTORY_TABLE /* HELLO history */ commandHistory HELLO_History[] = { +{"8.1.0","A new `availability_zone` field is added to the response if the `availability-zone` config is set."}, {"6.2.0","`protover` made optional; when called without arguments the command reports the current connection's context."}, }; #endif @@ -6654,6 +6668,147 @@ const char *COMMAND_Tips[] = { #define COMMAND_Keyspecs NULL #endif +/********** COMMANDLOG GET ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* COMMANDLOG GET history */ +#define COMMANDLOG_GET_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* COMMANDLOG GET tips */ +const char *COMMANDLOG_GET_Tips[] = { +"request_policy:all_nodes", +"nondeterministic_output", +}; +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* COMMANDLOG GET key specs */ +#define COMMANDLOG_GET_Keyspecs NULL +#endif + +/* COMMANDLOG GET type argument table */ +struct COMMAND_ARG COMMANDLOG_GET_type_Subargs[] = { +{MAKE_ARG("slow",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("large-request",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("large-reply",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +}; + +/* COMMANDLOG GET argument table */ +struct COMMAND_ARG COMMANDLOG_GET_Args[] = { +{MAKE_ARG("count",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("type",ARG_TYPE_ONEOF,-1,NULL,NULL,NULL,CMD_ARG_NONE,3,NULL),.subargs=COMMANDLOG_GET_type_Subargs}, +}; + +/********** COMMANDLOG HELP ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* COMMANDLOG HELP history */ +#define COMMANDLOG_HELP_History NULL +#endif + +#ifndef 
SKIP_CMD_TIPS_TABLE +/* COMMANDLOG HELP tips */ +#define COMMANDLOG_HELP_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* COMMANDLOG HELP key specs */ +#define COMMANDLOG_HELP_Keyspecs NULL +#endif + +/********** COMMANDLOG LEN ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* COMMANDLOG LEN history */ +#define COMMANDLOG_LEN_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* COMMANDLOG LEN tips */ +const char *COMMANDLOG_LEN_Tips[] = { +"request_policy:all_nodes", +"response_policy:agg_sum", +"nondeterministic_output", +}; +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* COMMANDLOG LEN key specs */ +#define COMMANDLOG_LEN_Keyspecs NULL +#endif + +/* COMMANDLOG LEN type argument table */ +struct COMMAND_ARG COMMANDLOG_LEN_type_Subargs[] = { +{MAKE_ARG("slow",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("large-request",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("large-reply",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +}; + +/* COMMANDLOG LEN argument table */ +struct COMMAND_ARG COMMANDLOG_LEN_Args[] = { +{MAKE_ARG("type",ARG_TYPE_ONEOF,-1,NULL,NULL,NULL,CMD_ARG_NONE,3,NULL),.subargs=COMMANDLOG_LEN_type_Subargs}, +}; + +/********** COMMANDLOG RESET ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* COMMANDLOG RESET history */ +#define COMMANDLOG_RESET_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* COMMANDLOG RESET tips */ +const char *COMMANDLOG_RESET_Tips[] = { +"request_policy:all_nodes", +"response_policy:all_succeeded", +}; +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* COMMANDLOG RESET key specs */ +#define COMMANDLOG_RESET_Keyspecs NULL +#endif + +/* COMMANDLOG RESET type argument table */ +struct COMMAND_ARG COMMANDLOG_RESET_type_Subargs[] = { +{MAKE_ARG("slow",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("large-request",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, 
+{MAKE_ARG("large-reply",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +}; + +/* COMMANDLOG RESET argument table */ +struct COMMAND_ARG COMMANDLOG_RESET_Args[] = { +{MAKE_ARG("type",ARG_TYPE_ONEOF,-1,NULL,NULL,NULL,CMD_ARG_NONE,3,NULL),.subargs=COMMANDLOG_RESET_type_Subargs}, +}; + +/* COMMANDLOG command table */ +struct COMMAND_STRUCT COMMANDLOG_Subcommands[] = { +{MAKE_CMD("get","Returns the specified command log's entries.","O(N) where N is the number of entries returned","8.1.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,COMMANDLOG_GET_History,0,COMMANDLOG_GET_Tips,2,commandlogCommand,4,CMD_ADMIN|CMD_LOADING|CMD_STALE,0,COMMANDLOG_GET_Keyspecs,0,NULL,2),.args=COMMANDLOG_GET_Args}, +{MAKE_CMD("help","Show helpful text about the different subcommands","O(1)","8.1.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,COMMANDLOG_HELP_History,0,COMMANDLOG_HELP_Tips,0,commandlogCommand,2,CMD_LOADING|CMD_STALE,0,COMMANDLOG_HELP_Keyspecs,0,NULL,0)}, +{MAKE_CMD("len","Returns the number of entries in the specified type of command log.","O(1)","8.1.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,COMMANDLOG_LEN_History,0,COMMANDLOG_LEN_Tips,3,commandlogCommand,3,CMD_ADMIN|CMD_LOADING|CMD_STALE,0,COMMANDLOG_LEN_Keyspecs,0,NULL,1),.args=COMMANDLOG_LEN_Args}, +{MAKE_CMD("reset","Clears all entries from the specified type of command log.","O(N) where N is the number of entries in the commandlog","8.1.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,COMMANDLOG_RESET_History,0,COMMANDLOG_RESET_Tips,2,commandlogCommand,3,CMD_ADMIN|CMD_LOADING|CMD_STALE,0,COMMANDLOG_RESET_Keyspecs,0,NULL,1),.args=COMMANDLOG_RESET_Args}, +{0} +}; + +/********** COMMANDLOG ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* COMMANDLOG history */ +#define COMMANDLOG_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* COMMANDLOG tips */ +#define COMMANDLOG_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* COMMANDLOG key specs */ +#define 
COMMANDLOG_Keyspecs NULL +#endif + /********** CONFIG GET ********************/ #ifndef SKIP_CMD_HISTORY_TABLE @@ -7817,10 +7972,10 @@ const char *SLOWLOG_RESET_Tips[] = { /* SLOWLOG command table */ struct COMMAND_STRUCT SLOWLOG_Subcommands[] = { -{MAKE_CMD("get","Returns the slow log's entries.","O(N) where N is the number of entries returned","2.2.12",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,SLOWLOG_GET_History,1,SLOWLOG_GET_Tips,2,slowlogCommand,-2,CMD_ADMIN|CMD_LOADING|CMD_STALE,0,SLOWLOG_GET_Keyspecs,0,NULL,1),.args=SLOWLOG_GET_Args}, -{MAKE_CMD("help","Show helpful text about the different subcommands","O(1)","6.2.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,SLOWLOG_HELP_History,0,SLOWLOG_HELP_Tips,0,slowlogCommand,2,CMD_LOADING|CMD_STALE,0,SLOWLOG_HELP_Keyspecs,0,NULL,0)}, -{MAKE_CMD("len","Returns the number of entries in the slow log.","O(1)","2.2.12",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,SLOWLOG_LEN_History,0,SLOWLOG_LEN_Tips,3,slowlogCommand,2,CMD_ADMIN|CMD_LOADING|CMD_STALE,0,SLOWLOG_LEN_Keyspecs,0,NULL,0)}, -{MAKE_CMD("reset","Clears all entries from the slow log.","O(N) where N is the number of entries in the slowlog","2.2.12",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,SLOWLOG_RESET_History,0,SLOWLOG_RESET_Tips,2,slowlogCommand,2,CMD_ADMIN|CMD_LOADING|CMD_STALE,0,SLOWLOG_RESET_Keyspecs,0,NULL,0)}, +{MAKE_CMD("get","Returns the slow log's entries.","O(N) where N is the number of entries returned","2.2.12",CMD_DOC_DEPRECATED,"`COMMANDLOG GET SLOW`","8.1.0","server",COMMAND_GROUP_SERVER,SLOWLOG_GET_History,1,SLOWLOG_GET_Tips,2,slowlogCommand,-2,CMD_ADMIN|CMD_LOADING|CMD_STALE,0,SLOWLOG_GET_Keyspecs,0,NULL,1),.args=SLOWLOG_GET_Args}, +{MAKE_CMD("help","Show helpful text about the different subcommands","O(1)","6.2.0",CMD_DOC_DEPRECATED,"`COMMANDLOG 
HELP`","8.1.0","server",COMMAND_GROUP_SERVER,SLOWLOG_HELP_History,0,SLOWLOG_HELP_Tips,0,slowlogCommand,2,CMD_LOADING|CMD_STALE,0,SLOWLOG_HELP_Keyspecs,0,NULL,0)}, +{MAKE_CMD("len","Returns the number of entries in the slow log.","O(1)","2.2.12",CMD_DOC_DEPRECATED,"`COMMANDLOG LEN SLOW`","8.1.0","server",COMMAND_GROUP_SERVER,SLOWLOG_LEN_History,0,SLOWLOG_LEN_Tips,3,slowlogCommand,2,CMD_ADMIN|CMD_LOADING|CMD_STALE,0,SLOWLOG_LEN_Keyspecs,0,NULL,0)}, +{MAKE_CMD("reset","Clears all entries from the slow log.","O(N) where N is the number of entries in the slowlog","2.2.12",CMD_DOC_DEPRECATED,"`COMMANDLOG RESET SLOW`","8.1.0","server",COMMAND_GROUP_SERVER,SLOWLOG_RESET_History,0,SLOWLOG_RESET_Tips,2,slowlogCommand,2,CMD_ADMIN|CMD_LOADING|CMD_STALE,0,SLOWLOG_RESET_Keyspecs,0,NULL,0)}, {0} }; @@ -10909,9 +11064,9 @@ struct COMMAND_STRUCT serverCommandTable[] = { {MAKE_CMD("readwrite","Enables read-write queries for a connection to a Valkey replica node.","O(1)","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,READWRITE_History,0,READWRITE_Tips,0,readwriteCommand,1,CMD_FAST|CMD_LOADING|CMD_STALE,ACL_CATEGORY_CONNECTION,READWRITE_Keyspecs,0,NULL,0)}, /* connection */ {MAKE_CMD("auth","Authenticates the connection.","O(N) where N is the number of passwords defined for the user","1.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,AUTH_History,1,AUTH_Tips,0,authCommand,-2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_FAST|CMD_NO_AUTH|CMD_SENTINEL|CMD_ALLOW_BUSY,ACL_CATEGORY_CONNECTION,AUTH_Keyspecs,0,NULL,2),.args=AUTH_Args}, -{MAKE_CMD("client","A container for client connection commands.","Depends on subcommand.","2.4.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_History,0,CLIENT_Tips,0,NULL,-2,CMD_SENTINEL,0,CLIENT_Keyspecs,0,NULL,0),.subcommands=CLIENT_Subcommands}, +{MAKE_CMD("client","A container for client connection commands.","Depends on 
subcommand.","2.4.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_History,0,CLIENT_Tips,0,clientCommand,-2,CMD_SENTINEL,0,CLIENT_Keyspecs,0,NULL,0),.subcommands=CLIENT_Subcommands}, {MAKE_CMD("echo","Returns the given string.","O(1)","1.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,ECHO_History,0,ECHO_Tips,0,echoCommand,2,CMD_LOADING|CMD_STALE|CMD_FAST,ACL_CATEGORY_CONNECTION,ECHO_Keyspecs,0,NULL,1),.args=ECHO_Args}, -{MAKE_CMD("hello","Handshakes with the server.","O(1)","6.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,HELLO_History,1,HELLO_Tips,0,helloCommand,-1,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_FAST|CMD_NO_AUTH|CMD_SENTINEL|CMD_ALLOW_BUSY,ACL_CATEGORY_CONNECTION,HELLO_Keyspecs,0,NULL,1),.args=HELLO_Args}, +{MAKE_CMD("hello","Handshakes with the server.","O(1)","6.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,HELLO_History,2,HELLO_Tips,0,helloCommand,-1,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_FAST|CMD_NO_AUTH|CMD_SENTINEL|CMD_ALLOW_BUSY,ACL_CATEGORY_CONNECTION,HELLO_Keyspecs,0,NULL,1),.args=HELLO_Args}, {MAKE_CMD("ping","Returns the server's liveliness response.","O(1)","1.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,PING_History,0,PING_Tips,2,pingCommand,-1,CMD_FAST|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,PING_Keyspecs,0,NULL,1),.args=PING_Args}, {MAKE_CMD("quit","Closes the connection.","O(1)","1.0.0",CMD_DOC_DEPRECATED,"just closing the connection","7.2.0","connection",COMMAND_GROUP_CONNECTION,QUIT_History,0,QUIT_Tips,0,quitCommand,-1,CMD_ALLOW_BUSY|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_FAST|CMD_NO_AUTH,ACL_CATEGORY_CONNECTION,QUIT_Keyspecs,0,NULL,0)}, {MAKE_CMD("reset","Resets the connection.","O(1)","6.2.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,RESET_History,0,RESET_Tips,0,resetCommand,1,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_FAST|CMD_NO_AUTH|CMD_ALLOW_BUSY,ACL_CATEGORY_CONNECTION,RESET_Keyspecs,0,NULL,0)}, @@ -11029,6 +11184,7 @@ 
struct COMMAND_STRUCT serverCommandTable[] = { {MAKE_CMD("bgrewriteaof","Asynchronously rewrites the append-only file to disk.","O(1)","1.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,BGREWRITEAOF_History,0,BGREWRITEAOF_Tips,0,bgrewriteaofCommand,1,CMD_NO_ASYNC_LOADING|CMD_ADMIN|CMD_NOSCRIPT,0,BGREWRITEAOF_Keyspecs,0,NULL,0)}, {MAKE_CMD("bgsave","Asynchronously saves the database(s) to disk.","O(1)","1.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,BGSAVE_History,2,BGSAVE_Tips,0,bgsaveCommand,-1,CMD_NO_ASYNC_LOADING|CMD_ADMIN|CMD_NOSCRIPT,0,BGSAVE_Keyspecs,0,NULL,1),.args=BGSAVE_Args}, {MAKE_CMD("command","Returns detailed information about all commands.","O(N) where N is the total number of commands","2.8.13",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,COMMAND_History,0,COMMAND_Tips,1,commandCommand,-1,CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,COMMAND_Keyspecs,0,NULL,0),.subcommands=COMMAND_Subcommands}, +{MAKE_CMD("commandlog","A container for command log commands.","Depends on subcommand.","8.1.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,COMMANDLOG_History,0,COMMANDLOG_Tips,0,NULL,-2,0,0,COMMANDLOG_Keyspecs,0,NULL,0),.subcommands=COMMANDLOG_Subcommands}, {MAKE_CMD("config","A container for server configuration commands.","Depends on subcommand.","2.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,CONFIG_History,0,CONFIG_Tips,0,NULL,-2,0,0,CONFIG_Keyspecs,0,NULL,0),.subcommands=CONFIG_Subcommands}, {MAKE_CMD("dbsize","Returns the number of keys in the database.","O(1)","1.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,DBSIZE_History,0,DBSIZE_Tips,2,dbsizeCommand,1,CMD_READONLY|CMD_FAST,ACL_CATEGORY_KEYSPACE,DBSIZE_Keyspecs,0,NULL,0)}, {MAKE_CMD("debug","A container for debugging commands.","Depends on 
subcommand.","1.0.0",CMD_DOC_SYSCMD,NULL,NULL,"server",COMMAND_GROUP_SERVER,DEBUG_History,0,DEBUG_Tips,0,debugCommand,-2,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_PROTECTED,0,DEBUG_Keyspecs,0,NULL,0)}, @@ -11050,7 +11206,7 @@ struct COMMAND_STRUCT serverCommandTable[] = { {MAKE_CMD("save","Synchronously saves the database(s) to disk.","O(N) where N is the total number of keys in all databases","1.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,SAVE_History,0,SAVE_Tips,0,saveCommand,1,CMD_NO_ASYNC_LOADING|CMD_ADMIN|CMD_NOSCRIPT|CMD_NO_MULTI,0,SAVE_Keyspecs,0,NULL,0)}, {MAKE_CMD("shutdown","Synchronously saves the database(s) to disk and shuts down the server.","O(N) when saving, where N is the total number of keys in all databases when saving data, otherwise O(1)","1.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,SHUTDOWN_History,1,SHUTDOWN_Tips,0,shutdownCommand,-1,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_NO_MULTI|CMD_SENTINEL|CMD_ALLOW_BUSY,0,SHUTDOWN_Keyspecs,0,NULL,1),.args=SHUTDOWN_Args}, {MAKE_CMD("slaveof","Sets a server as a replica of another, or promotes it to being a primary.","O(1)","1.0.0",CMD_DOC_DEPRECATED,"`REPLICAOF`","5.0.0","server",COMMAND_GROUP_SERVER,SLAVEOF_History,0,SLAVEOF_Tips,0,replicaofCommand,3,CMD_NO_ASYNC_LOADING|CMD_ADMIN|CMD_NOSCRIPT|CMD_STALE,0,SLAVEOF_Keyspecs,0,NULL,1),.args=SLAVEOF_Args}, -{MAKE_CMD("slowlog","A container for slow log commands.","Depends on subcommand.","2.2.12",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,SLOWLOG_History,0,SLOWLOG_Tips,0,NULL,-2,0,0,SLOWLOG_Keyspecs,0,NULL,0),.subcommands=SLOWLOG_Subcommands}, +{MAKE_CMD("slowlog","A container for slow log commands.","Depends on subcommand.","2.2.12",CMD_DOC_DEPRECATED,"`COMMANDLOG`","8.1.0","server",COMMAND_GROUP_SERVER,SLOWLOG_History,0,SLOWLOG_Tips,0,NULL,-2,0,0,SLOWLOG_Keyspecs,0,NULL,0),.subcommands=SLOWLOG_Subcommands}, {MAKE_CMD("swapdb","Swaps two databases.","O(N) where N is the count of clients watching or 
blocking on keys from both databases.","4.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,SWAPDB_History,0,SWAPDB_Tips,0,swapdbCommand,3,CMD_WRITE|CMD_FAST,ACL_CATEGORY_KEYSPACE|ACL_CATEGORY_DANGEROUS,SWAPDB_Keyspecs,0,NULL,2),.args=SWAPDB_Args}, {MAKE_CMD("sync","An internal command used in replication.",NULL,"1.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,SYNC_History,0,SYNC_Tips,0,syncCommand,1,CMD_NO_ASYNC_LOADING|CMD_ADMIN|CMD_NO_MULTI|CMD_NOSCRIPT,0,SYNC_Keyspecs,0,NULL,0)}, {MAKE_CMD("time","Returns the server time.","O(1)","2.6.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,TIME_History,0,TIME_Tips,1,timeCommand,1,CMD_LOADING|CMD_STALE|CMD_FAST,0,TIME_Keyspecs,0,NULL,0)}, @@ -11149,7 +11305,7 @@ struct COMMAND_STRUCT serverCommandTable[] = { {MAKE_CMD("substr","Returns a substring from a string value.","O(N) where N is the length of the returned string. The complexity is ultimately determined by the returned length, but because creating a substring from an existing string is very cheap, it can be considered O(1) for small strings.","1.0.0",CMD_DOC_DEPRECATED,"`GETRANGE`","2.0.0","string",COMMAND_GROUP_STRING,SUBSTR_History,0,SUBSTR_Tips,0,getrangeCommand,4,CMD_READONLY,ACL_CATEGORY_STRING,SUBSTR_Keyspecs,1,NULL,3),.args=SUBSTR_Args}, /* transactions */ {MAKE_CMD("discard","Discards a transaction.","O(N), when N is the number of queued commands","2.0.0",CMD_DOC_NONE,NULL,NULL,"transactions",COMMAND_GROUP_TRANSACTIONS,DISCARD_History,0,DISCARD_Tips,0,discardCommand,1,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_FAST|CMD_ALLOW_BUSY,ACL_CATEGORY_TRANSACTION,DISCARD_Keyspecs,0,NULL,0)}, -{MAKE_CMD("exec","Executes all commands in a transaction.","Depends on commands in the transaction","1.2.0",CMD_DOC_NONE,NULL,NULL,"transactions",COMMAND_GROUP_TRANSACTIONS,EXEC_History,0,EXEC_Tips,0,execCommand,1,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SKIP_SLOWLOG,ACL_CATEGORY_TRANSACTION,EXEC_Keyspecs,0,NULL,0)}, +{MAKE_CMD("exec","Executes all 
commands in a transaction.","Depends on commands in the transaction","1.2.0",CMD_DOC_NONE,NULL,NULL,"transactions",COMMAND_GROUP_TRANSACTIONS,EXEC_History,0,EXEC_Tips,0,execCommand,1,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SKIP_COMMANDLOG,ACL_CATEGORY_TRANSACTION,EXEC_Keyspecs,0,NULL,0)}, {MAKE_CMD("multi","Starts a transaction.","O(1)","1.2.0",CMD_DOC_NONE,NULL,NULL,"transactions",COMMAND_GROUP_TRANSACTIONS,MULTI_History,0,MULTI_Tips,0,multiCommand,1,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_FAST|CMD_NO_MULTI|CMD_ALLOW_BUSY,ACL_CATEGORY_TRANSACTION,MULTI_Keyspecs,0,NULL,0)}, {MAKE_CMD("unwatch","Forgets about watched keys of a transaction.","O(1)","2.2.0",CMD_DOC_NONE,NULL,NULL,"transactions",COMMAND_GROUP_TRANSACTIONS,UNWATCH_History,0,UNWATCH_Tips,0,unwatchCommand,1,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_FAST|CMD_ALLOW_BUSY,ACL_CATEGORY_TRANSACTION,UNWATCH_Keyspecs,0,NULL,0)}, {MAKE_CMD("watch","Monitors changes to keys to determine the execution of a transaction.","O(1) for every key.","2.2.0",CMD_DOC_NONE,NULL,NULL,"transactions",COMMAND_GROUP_TRANSACTIONS,WATCH_History,0,WATCH_Tips,0,watchCommand,-2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_FAST|CMD_NO_MULTI|CMD_ALLOW_BUSY,ACL_CATEGORY_TRANSACTION,WATCH_Keyspecs,1,NULL,1),.args=WATCH_Args}, diff --git a/src/commands/client-caching.json b/src/commands/client-caching.json index 2a4ae891db..d661492f45 100644 --- a/src/commands/client-caching.json +++ b/src/commands/client-caching.json @@ -6,7 +6,7 @@ "since": "6.0.0", "arity": 3, "container": "CLIENT", - "function": "clientCommand", + "function": "clientCachingCommand", "command_flags": [ "NOSCRIPT", "LOADING", diff --git a/src/commands/client-capa.json b/src/commands/client-capa.json index 3c16cd44f9..0d0f577f94 100644 --- a/src/commands/client-capa.json +++ b/src/commands/client-capa.json @@ -6,7 +6,7 @@ "since": "8.0.0", "arity": -3, "container": "CLIENT", - "function": "clientCommand", + "function": "clientCapaCommand", "command_flags": [ "NOSCRIPT", "LOADING", 
diff --git a/src/commands/client-getname.json b/src/commands/client-getname.json index 9e237af849..e13db064b7 100644 --- a/src/commands/client-getname.json +++ b/src/commands/client-getname.json @@ -6,7 +6,7 @@ "since": "2.6.9", "arity": 2, "container": "CLIENT", - "function": "clientCommand", + "function": "clientGetNameCommand", "command_flags": [ "NOSCRIPT", "LOADING", diff --git a/src/commands/client-getredir.json b/src/commands/client-getredir.json index 6fdb002dc8..3df1df6b6f 100644 --- a/src/commands/client-getredir.json +++ b/src/commands/client-getredir.json @@ -6,7 +6,7 @@ "since": "6.0.0", "arity": 2, "container": "CLIENT", - "function": "clientCommand", + "function": "clientGetredirCommand", "command_flags": [ "NOSCRIPT", "LOADING", diff --git a/src/commands/client-help.json b/src/commands/client-help.json index b49294c9ee..ae771d52ae 100644 --- a/src/commands/client-help.json +++ b/src/commands/client-help.json @@ -6,7 +6,7 @@ "since": "5.0.0", "arity": 2, "container": "CLIENT", - "function": "clientCommand", + "function": "clientHelpCommand", "command_flags": [ "LOADING", "STALE", diff --git a/src/commands/client-id.json b/src/commands/client-id.json index 7c2bf08200..f6131250dd 100644 --- a/src/commands/client-id.json +++ b/src/commands/client-id.json @@ -6,7 +6,7 @@ "since": "5.0.0", "arity": 2, "container": "CLIENT", - "function": "clientCommand", + "function": "clientIDCommand", "command_flags": [ "NOSCRIPT", "LOADING", diff --git a/src/commands/client-import-source.json b/src/commands/client-import-source.json index 113c07d70a..dd5ef65e77 100644 --- a/src/commands/client-import-source.json +++ b/src/commands/client-import-source.json @@ -6,7 +6,7 @@ "since": "8.1.0", "arity": 3, "container": "CLIENT", - "function": "clientCommand", + "function": "clientImportSourceCommand", "command_flags": [ "NOSCRIPT", "LOADING", diff --git a/src/commands/client-info.json b/src/commands/client-info.json index f974da437b..afda2ca967 100644 --- 
a/src/commands/client-info.json +++ b/src/commands/client-info.json @@ -6,7 +6,7 @@ "since": "6.2.0", "arity": 2, "container": "CLIENT", - "function": "clientCommand", + "function": "clientInfoCommand", "command_flags": [ "NOSCRIPT", "LOADING", diff --git a/src/commands/client-kill.json b/src/commands/client-kill.json index 97fa932cd8..0ae3579534 100644 --- a/src/commands/client-kill.json +++ b/src/commands/client-kill.json @@ -6,7 +6,7 @@ "since": "2.4.0", "arity": -3, "container": "CLIENT", - "function": "clientCommand", + "function": "clientKillCommand", "history": [ [ "2.8.12", @@ -35,6 +35,10 @@ [ "8.0.0", "Replaced `master` `TYPE` with `primary`. `master` still supported for backward compatibility." + ], + [ + "8.1.0", + "`ID` option accepts multiple IDs." ] ], "command_flags": [ @@ -68,6 +72,7 @@ "name": "client-id", "type": "integer", "optional": true, + "multiple": true, "since": "2.8.12" }, { diff --git a/src/commands/client-list.json b/src/commands/client-list.json index d9c0054e60..05e4de2419 100644 --- a/src/commands/client-list.json +++ b/src/commands/client-list.json @@ -6,7 +6,7 @@ "since": "2.4.0", "arity": -2, "container": "CLIENT", - "function": "clientCommand", + "function": "clientListCommand", "history": [ [ "2.8.12", @@ -35,6 +35,10 @@ [ "8.0.0", "Replaced `master` `TYPE` with `primary`. `master` still supported for backward compatibility." 
+ ], + [ + "8.1.0", + "Added filters USER, ADDR, LADDR, SKIPME, and MAXAGE" ] ], "command_flags": [ @@ -91,6 +95,55 @@ "optional": true, "multiple": true, "since": "6.2.0" + }, + { + "token": "USER", + "name": "username", + "type": "string", + "optional": true, + "since": "8.1.0" + }, + { + "token": "ADDR", + "name": "addr", + "display": "ip:port", + "type": "string", + "optional": true, + "since": "8.1.0" + }, + { + "token": "LADDR", + "name": "laddr", + "display": "ip:port", + "type": "string", + "optional": true, + "since": "8.1.0" + }, + { + "token": "SKIPME", + "name": "skipme", + "type": "oneof", + "optional": true, + "since": "8.1.0", + "arguments": [ + { + "name": "yes", + "type": "pure-token", + "token": "YES" + }, + { + "name": "no", + "type": "pure-token", + "token": "NO" + } + ] + }, + { + "token": "MAXAGE", + "name": "maxage", + "type": "integer", + "optional": true, + "since": "8.1.0" } ] } diff --git a/src/commands/client-no-evict.json b/src/commands/client-no-evict.json index 9ed6718405..710f8a97f9 100644 --- a/src/commands/client-no-evict.json +++ b/src/commands/client-no-evict.json @@ -6,7 +6,7 @@ "since": "7.0.0", "arity": 3, "container": "CLIENT", - "function": "clientCommand", + "function": "clientNoEvictCommand", "command_flags": [ "ADMIN", "NOSCRIPT", diff --git a/src/commands/client-no-touch.json b/src/commands/client-no-touch.json index 4cf7b72416..4196770a2e 100644 --- a/src/commands/client-no-touch.json +++ b/src/commands/client-no-touch.json @@ -6,7 +6,7 @@ "since": "7.2.0", "arity": 3, "container": "CLIENT", - "function": "clientCommand", + "function": "clientNoTouchCommand", "command_flags": [ "NOSCRIPT", "LOADING", diff --git a/src/commands/client-pause.json b/src/commands/client-pause.json index b1dd7bc478..54faf796c2 100644 --- a/src/commands/client-pause.json +++ b/src/commands/client-pause.json @@ -6,7 +6,7 @@ "since": "3.0.0", "arity": -3, "container": "CLIENT", - "function": "clientCommand", + "function": "clientPauseCommand", 
"history": [ [ "6.2.0", diff --git a/src/commands/client-reply.json b/src/commands/client-reply.json index 9406de85cf..8d2b713a69 100644 --- a/src/commands/client-reply.json +++ b/src/commands/client-reply.json @@ -6,7 +6,7 @@ "since": "3.2.0", "arity": 3, "container": "CLIENT", - "function": "clientCommand", + "function": "clientReplyCommand", "command_flags": [ "NOSCRIPT", "LOADING", diff --git a/src/commands/client-setname.json b/src/commands/client-setname.json index b071bd18ff..f544dc6a0f 100644 --- a/src/commands/client-setname.json +++ b/src/commands/client-setname.json @@ -6,7 +6,7 @@ "since": "2.6.9", "arity": 3, "container": "CLIENT", - "function": "clientCommand", + "function": "clientSetNameCommand", "command_flags": [ "NOSCRIPT", "LOADING", diff --git a/src/commands/client-tracking.json b/src/commands/client-tracking.json index 2c3768c2fb..1acf84fafc 100644 --- a/src/commands/client-tracking.json +++ b/src/commands/client-tracking.json @@ -6,7 +6,7 @@ "since": "6.0.0", "arity": -3, "container": "CLIENT", - "function": "clientCommand", + "function": "clientTrackingCommand", "command_flags": [ "NOSCRIPT", "LOADING", diff --git a/src/commands/client-trackinginfo.json b/src/commands/client-trackinginfo.json index 270a3d5e6e..78ba8201d7 100644 --- a/src/commands/client-trackinginfo.json +++ b/src/commands/client-trackinginfo.json @@ -6,7 +6,7 @@ "since": "6.2.0", "arity": 2, "container": "CLIENT", - "function": "clientCommand", + "function": "clientTrackingInfoCommand", "command_flags": [ "NOSCRIPT", "LOADING", diff --git a/src/commands/client-unblock.json b/src/commands/client-unblock.json index d391ede9e9..2173173f40 100644 --- a/src/commands/client-unblock.json +++ b/src/commands/client-unblock.json @@ -6,7 +6,7 @@ "since": "5.0.0", "arity": -3, "container": "CLIENT", - "function": "clientCommand", + "function": "clientUnblockCommand", "command_flags": [ "ADMIN", "NOSCRIPT", diff --git a/src/commands/client-unpause.json b/src/commands/client-unpause.json 
index 6c55210d2a..bb78fb848b 100644 --- a/src/commands/client-unpause.json +++ b/src/commands/client-unpause.json @@ -6,7 +6,7 @@ "since": "6.2.0", "arity": 2, "container": "CLIENT", - "function": "clientCommand", + "function": "clientUnpauseCommand", "command_flags": [ "ADMIN", "NOSCRIPT", diff --git a/src/commands/client.json b/src/commands/client.json index b50996128e..116fb4d4a2 100644 --- a/src/commands/client.json +++ b/src/commands/client.json @@ -4,6 +4,7 @@ "complexity": "Depends on subcommand.", "group": "connection", "since": "2.4.0", + "function": "clientCommand", "arity": -2, "command_flags": [ "SENTINEL" diff --git a/src/commands/commandlog-get.json b/src/commands/commandlog-get.json new file mode 100644 index 0000000000..00a5a01b0f --- /dev/null +++ b/src/commands/commandlog-get.json @@ -0,0 +1,85 @@ +{ + "GET": { + "summary": "Returns the specified command log's entries.", + "complexity": "O(N) where N is the number of entries returned", + "group": "server", + "since": "8.1.0", + "arity": 4, + "container": "COMMANDLOG", + "function": "commandlogCommand", + "command_flags": [ + "ADMIN", + "LOADING", + "STALE" + ], + "command_tips": [ + "REQUEST_POLICY:ALL_NODES", + "NONDETERMINISTIC_OUTPUT" + ], + "reply_schema": { + "type": "array", + "description": "Entries from the command log in chronological order.", + "uniqueItems": true, + "items": { + "type": "array", + "minItems": 6, + "maxItems": 6, + "items": [ + { + "type": "integer", + "description": "Command log entry ID." + }, + { + "type": "integer", + "description": "The unix timestamp at which the logged command was processed.", + "minimum": 0 + }, + { + "type": "integer", + "description": "Determined by the type parameter.", + "minimum": 0 + }, + { + "type": "array", + "description": "The arguments of the command.", + "items": { + "type": "string" + } + }, + { + "type": "string", + "description": "Client IP address and port." 
+ }, + { + "type": "string", + "description": "Client name if set via the CLIENT SETNAME command." + } + ] + } + }, + "arguments": [ + { + "name": "count", + "type": "integer" + }, + { + "name": "type", + "type": "oneof", + "arguments": [ + { + "name": "slow", + "type": "string" + }, + { + "name": "large-request", + "type": "string" + }, + { + "name": "large-reply", + "type": "string" + } + ] + } + ] + } +} diff --git a/src/commands/commandlog-help.json b/src/commands/commandlog-help.json new file mode 100644 index 0000000000..dacf7d6209 --- /dev/null +++ b/src/commands/commandlog-help.json @@ -0,0 +1,22 @@ +{ + "HELP": { + "summary": "Show helpful text about the different subcommands", + "complexity": "O(1)", + "group": "server", + "since": "8.1.0", + "arity": 2, + "container": "COMMANDLOG", + "function": "commandlogCommand", + "command_flags": [ + "LOADING", + "STALE" + ], + "reply_schema": { + "type": "array", + "description": "Helpful text about subcommands.", + "items": { + "type": "string" + } + } + } +} diff --git a/src/commands/commandlog-len.json b/src/commands/commandlog-len.json new file mode 100644 index 0000000000..4f1bb44075 --- /dev/null +++ b/src/commands/commandlog-len.json @@ -0,0 +1,46 @@ +{ + "LEN": { + "summary": "Returns the number of entries in the specified type of command log.", + "complexity": "O(1)", + "group": "server", + "since": "8.1.0", + "arity": 3, + "container": "COMMANDLOG", + "function": "commandlogCommand", + "command_flags": [ + "ADMIN", + "LOADING", + "STALE" + ], + "command_tips": [ + "REQUEST_POLICY:ALL_NODES", + "RESPONSE_POLICY:AGG_SUM", + "NONDETERMINISTIC_OUTPUT" + ], + "reply_schema": { + "type": "integer", + "description": "Number of entries in the command log.", + "minimum": 0 + }, + "arguments": [ + { + "name": "type", + "type": "oneof", + "arguments": [ + { + "name": "slow", + "type": "string" + }, + { + "name": "large-request", + "type": "string" + }, + { + "name": "large-reply", + "type": "string" + } + ] + } + ] 
+ } +} diff --git a/src/commands/commandlog-reset.json b/src/commands/commandlog-reset.json new file mode 100644 index 0000000000..e43c4d4404 --- /dev/null +++ b/src/commands/commandlog-reset.json @@ -0,0 +1,43 @@ +{ + "RESET": { + "summary": "Clears all entries from the specified type of command log.", + "complexity": "O(N) where N is the number of entries in the commandlog", + "group": "server", + "since": "8.1.0", + "arity": 3, + "container": "COMMANDLOG", + "function": "commandlogCommand", + "command_flags": [ + "ADMIN", + "LOADING", + "STALE" + ], + "command_tips": [ + "REQUEST_POLICY:ALL_NODES", + "RESPONSE_POLICY:ALL_SUCCEEDED" + ], + "reply_schema": { + "const": "OK" + }, + "arguments": [ + { + "name": "type", + "type": "oneof", + "arguments": [ + { + "name": "slow", + "type": "string" + }, + { + "name": "large-request", + "type": "string" + }, + { + "name": "large-reply", + "type": "string" + } + ] + } + ] + } +} diff --git a/src/commands/commandlog.json b/src/commands/commandlog.json new file mode 100644 index 0000000000..2ff2376436 --- /dev/null +++ b/src/commands/commandlog.json @@ -0,0 +1,9 @@ +{ + "COMMANDLOG": { + "summary": "A container for command log commands.", + "complexity": "Depends on subcommand.", + "group": "server", + "since": "8.1.0", + "arity": -2 + } +} diff --git a/src/commands/exec.json b/src/commands/exec.json index 5f03d76e08..3b1b1faff1 100644 --- a/src/commands/exec.json +++ b/src/commands/exec.json @@ -10,7 +10,7 @@ "NOSCRIPT", "LOADING", "STALE", - "SKIP_SLOWLOG" + "SKIP_COMMANDLOG" ], "acl_categories": [ "TRANSACTION" diff --git a/src/commands/hello.json b/src/commands/hello.json index f3fcc5a13c..15fd81c655 100644 --- a/src/commands/hello.json +++ b/src/commands/hello.json @@ -7,6 +7,10 @@ "arity": -1, "function": "helloCommand", "history": [ + [ + "8.1.0", + "A new `availability_zone` field is added to the response if the `availability-zone` config is set." 
+ ], [ "6.2.0", "`protover` made optional; when called without arguments the command reports the current connection's context." diff --git a/src/commands/set.json b/src/commands/set.json index 3d3800f11d..601bd676a2 100644 --- a/src/commands/set.json +++ b/src/commands/set.json @@ -111,14 +111,7 @@ "type": "string", "token": "IFEQ", "since": "8.1.0", - "summary": "Sets the key's value only if the current value matches the specified comparison value.", - "arguments": [ - { - "name": "comparison-value", - "type": "string", - "summary": "The value to compare with the current key's value before setting." - } - ] + "summary": "Sets the key's value only if the current value matches the specified comparison value." } ] }, diff --git a/src/commands/slowlog-get.json b/src/commands/slowlog-get.json index ffc54b5454..3f57b87ed8 100644 --- a/src/commands/slowlog-get.json +++ b/src/commands/slowlog-get.json @@ -7,6 +7,11 @@ "arity": -2, "container": "SLOWLOG", "function": "slowlogCommand", + "deprecated_since": "8.1.0", + "replaced_by": "`COMMANDLOG GET SLOW`", + "doc_flags": [ + "DEPRECATED" + ], "history": [ [ "4.0.0", diff --git a/src/commands/slowlog-help.json b/src/commands/slowlog-help.json index dde8fd4598..1db5520e1f 100644 --- a/src/commands/slowlog-help.json +++ b/src/commands/slowlog-help.json @@ -7,6 +7,11 @@ "arity": 2, "container": "SLOWLOG", "function": "slowlogCommand", + "deprecated_since": "8.1.0", + "replaced_by": "`COMMANDLOG HELP`", + "doc_flags": [ + "DEPRECATED" + ], "command_flags": [ "LOADING", "STALE" diff --git a/src/commands/slowlog-len.json b/src/commands/slowlog-len.json index 717a8ad416..8b4e2b86c8 100644 --- a/src/commands/slowlog-len.json +++ b/src/commands/slowlog-len.json @@ -7,6 +7,11 @@ "arity": 2, "container": "SLOWLOG", "function": "slowlogCommand", + "deprecated_since": "8.1.0", + "replaced_by": "`COMMANDLOG LEN SLOW`", + "doc_flags": [ + "DEPRECATED" + ], "command_flags": [ "ADMIN", "LOADING", diff --git a/src/commands/slowlog-reset.json 
b/src/commands/slowlog-reset.json index cfc1e4da7f..d5fd4b02a2 100644 --- a/src/commands/slowlog-reset.json +++ b/src/commands/slowlog-reset.json @@ -7,6 +7,11 @@ "arity": 2, "container": "SLOWLOG", "function": "slowlogCommand", + "deprecated_since": "8.1.0", + "replaced_by": "`COMMANDLOG RESET SLOW`", + "doc_flags": [ + "DEPRECATED" + ], "command_flags": [ "ADMIN", "LOADING", diff --git a/src/commands/slowlog.json b/src/commands/slowlog.json index 1b9526b191..0120d55eda 100644 --- a/src/commands/slowlog.json +++ b/src/commands/slowlog.json @@ -4,6 +4,11 @@ "complexity": "Depends on subcommand.", "group": "server", "since": "2.2.12", - "arity": -2 + "arity": -2, + "deprecated_since": "8.1.0", + "replaced_by": "`COMMANDLOG`", + "doc_flags": [ + "DEPRECATED" + ] } } diff --git a/src/config.c b/src/config.c index cc0f8d2dd8..de8d00dce0 100644 --- a/src/config.c +++ b/src/config.c @@ -32,6 +32,7 @@ #include "cluster.h" #include "connection.h" #include "bio.h" +#include "module.h" #include #include @@ -283,7 +284,7 @@ struct standardConfig { void *privdata; /* privdata for this config, for module configs this is a ModuleConfig struct */ }; -dict *configs = NULL; /* Runtime config values */ +static dict *configs = NULL; /* Runtime config values */ /* Lookup a config by the provided sds string name, or return NULL * if the config does not exist */ @@ -297,7 +298,7 @@ static standardConfig *lookupConfig(sds name) { *----------------------------------------------------------------------------*/ /* Get enum value from name. If there is no match INT_MIN is returned. 
*/ -int configEnumGetValue(configEnum *ce, sds *argv, int argc, int bitflags) { +static int configEnumGetValue(configEnum *ce, sds *argv, int argc, int bitflags) { if (argc == 0 || (!bitflags && argc != 1)) return INT_MIN; int values = 0; for (int i = 0; i < argc; i++) { @@ -371,20 +372,6 @@ void resetServerSaveParams(void) { server.saveparamslen = 0; } -void queueLoadModule(sds path, sds *argv, int argc) { - int i; - struct moduleLoadQueueEntry *loadmod; - - loadmod = zmalloc(sizeof(struct moduleLoadQueueEntry)); - loadmod->argv = argc ? zmalloc(sizeof(robj *) * argc) : NULL; - loadmod->path = sdsnew(path); - loadmod->argc = argc; - for (i = 0; i < argc; i++) { - loadmod->argv[i] = createRawStringObject(argv[i], sdslen(argv[i])); - } - listAddNodeTail(server.loadmodule_queue, loadmod); -} - /* Parse an array of `arg_len` sds strings, validate and populate * server.client_obuf_limits if valid. * Used in CONFIG SET and configuration file parsing. */ @@ -567,7 +554,7 @@ void loadServerConfigFromString(char *config) { goto loaderr; } } else if (!strcasecmp(argv[0], "loadmodule") && argc >= 2) { - queueLoadModule(argv[1], &argv[2], argc - 2); + moduleEnqueueLoadModule(argv[1], &argv[2], argc - 2); } else if (strchr(argv[0], '.')) { if (argc < 2) { err = "Module config specified without value"; @@ -831,7 +818,7 @@ void configSetCommand(client *c) { /* Note: it's important we run over ALL passed configs and check if we need to call * `redactClientCommandArgument()`. This is in order to avoid anyone using this command for a - * log/slowlog/monitor/etc. displaying sensitive info. So even if we encounter an error we still continue + * log/commandlog/monitor/etc. displaying sensitive info. So even if we encounter an error we still continue * running over the remaining arguments. 
*/ if (config->flags & SENSITIVE_CONFIG) { redactClientCommandArgument(c, 2 + i * 2 + 1); @@ -1583,12 +1570,7 @@ void rewriteConfigLoadmoduleOption(struct rewriteConfigState *state) { dictEntry *de; while ((de = dictNext(di)) != NULL) { struct ValkeyModule *module = dictGetVal(de); - line = sdsnew("loadmodule "); - line = sdscatsds(line, module->loadmod->path); - for (int i = 0; i < module->loadmod->argc; i++) { - line = sdscatlen(line, " ", 1); - line = sdscatsds(line, module->loadmod->argv[i]->ptr); - } + line = moduleLoadQueueEntryToLoadmoduleOptionStr(module, "loadmodule"); rewriteConfigRewriteLine(state, "loadmodule", line, 1); } dictReleaseIterator(di); @@ -3186,7 +3168,7 @@ standardConfig static_configs[] = { createBoolConfig("replica-read-only", "slave-read-only", DEBUG_CONFIG | MODIFIABLE_CONFIG, server.repl_replica_ro, 1, NULL, NULL), createBoolConfig("replica-ignore-maxmemory", "slave-ignore-maxmemory", MODIFIABLE_CONFIG, server.repl_replica_ignore_maxmemory, 1, NULL, NULL), createBoolConfig("jemalloc-bg-thread", NULL, MODIFIABLE_CONFIG, server.jemalloc_bg_thread, 1, NULL, updateJemallocBgThread), - createBoolConfig("activedefrag", NULL, DEBUG_CONFIG | MODIFIABLE_CONFIG, server.active_defrag_enabled, 0, isValidActiveDefrag, NULL), + createBoolConfig("activedefrag", NULL, DEBUG_CONFIG | MODIFIABLE_CONFIG, server.active_defrag_enabled, CONFIG_ACTIVE_DEFRAG_DEFAULT, isValidActiveDefrag, NULL), createBoolConfig("syslog-enabled", NULL, IMMUTABLE_CONFIG, server.syslog_enabled, 0, NULL, NULL), createBoolConfig("cluster-enabled", NULL, IMMUTABLE_CONFIG, server.cluster_enabled, 0, NULL, NULL), createBoolConfig("appendonly", NULL, MODIFIABLE_CONFIG | DENY_LOADING_CONFIG, server.aof_enabled, 0, NULL, updateAppendonly), @@ -3320,7 +3302,9 @@ standardConfig static_configs[] = { /* Unsigned Long configs */ createULongConfig("active-defrag-max-scan-fields", NULL, MODIFIABLE_CONFIG, 1, LONG_MAX, server.active_defrag_max_scan_fields, 1000, INTEGER_CONFIG, NULL, NULL), /* 
Default: keys with more than 1000 fields will be processed separately */ - createULongConfig("slowlog-max-len", NULL, MODIFIABLE_CONFIG, 0, LONG_MAX, server.slowlog_max_len, 128, INTEGER_CONFIG, NULL, NULL), + createULongConfig("commandlog-slow-execution-max-len", "slowlog-max-len", MODIFIABLE_CONFIG, 0, LONG_MAX, server.commandlog[COMMANDLOG_TYPE_SLOW].max_len, 128, INTEGER_CONFIG, NULL, NULL), + createULongConfig("commandlog-large-request-max-len", NULL, MODIFIABLE_CONFIG, 0, LONG_MAX, server.commandlog[COMMANDLOG_TYPE_LARGE_REQUEST].max_len, 128, INTEGER_CONFIG, NULL, NULL), + createULongConfig("commandlog-large-reply-max-len", NULL, MODIFIABLE_CONFIG, 0, LONG_MAX, server.commandlog[COMMANDLOG_TYPE_LARGE_REPLY].max_len, 128, INTEGER_CONFIG, NULL, NULL), createULongConfig("acllog-max-len", NULL, MODIFIABLE_CONFIG, 0, LONG_MAX, server.acllog_max_len, 128, INTEGER_CONFIG, NULL, NULL), createULongConfig("cluster-blacklist-ttl", NULL, MODIFIABLE_CONFIG, 0, ULONG_MAX, server.cluster_blacklist_ttl, 60, INTEGER_CONFIG, NULL, NULL), @@ -3328,7 +3312,9 @@ standardConfig static_configs[] = { createLongLongConfig("busy-reply-threshold", "lua-time-limit", MODIFIABLE_CONFIG, 0, LONG_MAX, server.busy_reply_threshold, 5000, INTEGER_CONFIG, NULL, NULL), /* milliseconds */ createLongLongConfig("cluster-node-timeout", NULL, MODIFIABLE_CONFIG, 0, LLONG_MAX, server.cluster_node_timeout, 15000, INTEGER_CONFIG, NULL, NULL), createLongLongConfig("cluster-ping-interval", NULL, MODIFIABLE_CONFIG | HIDDEN_CONFIG, 0, LLONG_MAX, server.cluster_ping_interval, 0, INTEGER_CONFIG, NULL, NULL), - createLongLongConfig("slowlog-log-slower-than", NULL, MODIFIABLE_CONFIG, -1, LLONG_MAX, server.slowlog_log_slower_than, 10000, INTEGER_CONFIG, NULL, NULL), + createLongLongConfig("commandlog-execution-slower-than", "slowlog-log-slower-than", MODIFIABLE_CONFIG, -1, LLONG_MAX, server.commandlog[COMMANDLOG_TYPE_SLOW].threshold, 10000, INTEGER_CONFIG, NULL, NULL), + 
createLongLongConfig("commandlog-request-larger-than", NULL, MODIFIABLE_CONFIG, -1, LLONG_MAX, server.commandlog[COMMANDLOG_TYPE_LARGE_REQUEST].threshold, 1024 * 1024, INTEGER_CONFIG, NULL, NULL), + createLongLongConfig("commandlog-reply-larger-than", NULL, MODIFIABLE_CONFIG, -1, LLONG_MAX, server.commandlog[COMMANDLOG_TYPE_LARGE_REPLY].threshold, 1024 * 1024, INTEGER_CONFIG, NULL, NULL), createLongLongConfig("latency-monitor-threshold", NULL, MODIFIABLE_CONFIG, 0, LLONG_MAX, server.latency_monitor_threshold, 0, INTEGER_CONFIG, NULL, NULL), createLongLongConfig("proto-max-bulk-len", NULL, DEBUG_CONFIG | MODIFIABLE_CONFIG, 1024 * 1024, LONG_MAX, server.proto_max_bulk_len, 512ll * 1024 * 1024, MEMORY_CONFIG, NULL, NULL), /* Bulk request max size */ createLongLongConfig("stream-node-max-entries", NULL, MODIFIABLE_CONFIG, 0, LLONG_MAX, server.stream_node_max_entries, 100, INTEGER_CONFIG, NULL, NULL), @@ -3337,6 +3323,7 @@ standardConfig static_configs[] = { /* Unsigned Long Long configs */ createULongLongConfig("maxmemory", NULL, MODIFIABLE_CONFIG, 0, ULLONG_MAX, server.maxmemory, 0, MEMORY_CONFIG, NULL, updateMaxmemory), createULongLongConfig("cluster-link-sendbuf-limit", NULL, MODIFIABLE_CONFIG, 0, ULLONG_MAX, server.cluster_link_msg_queue_limit_bytes, 0, MEMORY_CONFIG, NULL, NULL), + createULongLongConfig("aof-max-size", NULL, MODIFIABLE_CONFIG, 0, ULLONG_MAX, server.aof_max_size, 0, INTEGER_CONFIG, NULL, NULL), /* Size_t configs */ createSizeTConfig("hash-max-listpack-entries", "hash-max-ziplist-entries", MODIFIABLE_CONFIG, 0, LONG_MAX, server.hash_max_listpack_entries, 512, INTEGER_CONFIG, NULL, NULL), diff --git a/src/connection.h b/src/connection.h index 8a2775ee34..fd7e0910cf 100644 --- a/src/connection.h +++ b/src/connection.h @@ -54,8 +54,9 @@ typedef enum { CONN_STATE_ERROR } ConnectionState; -#define CONN_FLAG_CLOSE_SCHEDULED (1 << 0) /* Closed scheduled by a handler */ -#define CONN_FLAG_WRITE_BARRIER (1 << 1) /* Write barrier requested */ +#define 
CONN_FLAG_CLOSE_SCHEDULED (1 << 0) /* Closed scheduled by a handler */ +#define CONN_FLAG_WRITE_BARRIER (1 << 1) /* Write barrier requested */ +#define CONN_FLAG_ALLOW_ACCEPT_OFFLOAD (1 << 2) /* Connection accept can be offloaded to IO threads. */ #define CONN_TYPE_SOCKET "tcp" #define CONN_TYPE_UNIX "unix" diff --git a/src/db.c b/src/db.c index 2bd40ba74b..f2a000030b 100644 --- a/src/db.c +++ b/src/db.c @@ -33,6 +33,7 @@ #include "script.h" #include "functions.h" #include "io_threads.h" +#include "module.h" #include #include @@ -124,7 +125,7 @@ robj *lookupKey(serverDb *db, robj *key, int flags) { * Don't do it if we have a saving child, as this will trigger * a copy on write madness. */ if (server.current_client && server.current_client->flag.no_touch && - server.current_client->cmd->proc != touchCommand) + server.executing_client->cmd->proc != touchCommand) flags |= LOOKUP_NOTOUCH; if (!hasActiveChildProcess() && !(flags & LOOKUP_NOTOUCH)) { /* Shared objects can't be stored in the database. */ @@ -894,9 +895,9 @@ void keysCommand(client *c) { kvstoreHashtableIterator *kvs_di = NULL; kvstoreIterator *kvs_it = NULL; if (pslot != -1) { - kvs_di = kvstoreGetHashtableSafeIterator(c->db->keys, pslot); + kvs_di = kvstoreGetHashtableIterator(c->db->keys, pslot, HASHTABLE_ITER_SAFE); } else { - kvs_it = kvstoreIteratorInit(c->db->keys); + kvs_it = kvstoreIteratorInit(c->db->keys, HASHTABLE_ITER_SAFE); } void *next; while (kvs_di ? kvstoreHashtableIteratorNext(kvs_di, &next) : kvstoreIteratorNext(kvs_it, &next)) { @@ -978,42 +979,53 @@ void keysScanCallback(void *privdata, void *entry) { /* This callback is used by scanGenericCommand in order to collect elements * returned by the dictionary iterator into a list. 
*/ -void scanCallback(void *privdata, const dictEntry *de) { +void hashtableScanCallback(void *privdata, void *entry) { scanData *data = (scanData *)privdata; - list *keys = data->keys; - robj *o = data->o; sds val = NULL; sds key = NULL; + + robj *o = data->o; + list *keys = data->keys; data->sampled++; /* This callback is only used for scanning elements within a key (hash * fields, set elements, etc.) so o must be set here. */ serverAssert(o != NULL); + /* get key, value */ + if (o->type == OBJ_SET) { + key = (sds)entry; + } else if (o->type == OBJ_ZSET) { + zskiplistNode *node = (zskiplistNode *)entry; + key = node->ele; + /* zset data is copied after filtering by key */ + } else if (o->type == OBJ_HASH) { + key = hashTypeEntryGetField(entry); + if (!data->only_keys) { + val = hashTypeEntryGetValue(entry); + } + } else { + serverPanic("Type not handled in hashtable SCAN callback."); + } + /* Filter element if it does not match the pattern. */ - sds keysds = dictGetKey(de); if (data->pattern) { - if (!stringmatchlen(data->pattern, sdslen(data->pattern), keysds, sdslen(keysds), 0)) { + if (!stringmatchlen(data->pattern, sdslen(data->pattern), key, sdslen(key), 0)) { return; } } - if (o->type == OBJ_SET) { - key = keysds; - } else if (o->type == OBJ_HASH) { - key = keysds; - if (!data->only_keys) { - val = dictGetVal(de); - } - } else if (o->type == OBJ_ZSET) { - key = sdsdup(keysds); + /* zset data must be copied. Do this after filtering to avoid unneeded + * allocations. 
*/ + if (o->type == OBJ_ZSET) { + /* zset data is copied */ + zskiplistNode *node = (zskiplistNode *)entry; + key = sdsdup(node->ele); if (!data->only_keys) { char buf[MAX_LONG_DOUBLE_CHARS]; - int len = ld2string(buf, sizeof(buf), *(double *)dictGetVal(de), LD_STR_AUTO); + int len = ld2string(buf, sizeof(buf), node->score, LD_STR_AUTO); val = sdsnewlen(buf, len); } - } else { - serverPanic("Type not handled in SCAN callback."); } listAddNodeTail(keys, key); @@ -1083,7 +1095,6 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) { sds typename = NULL; long long type = LLONG_MAX; int patlen = 0, use_pattern = 0, only_keys = 0; - dict *ht; /* Object must be NULL (to iterate keys names), or the type of the object * must be Set, Sorted Set, or Hash. */ @@ -1152,33 +1163,33 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) { * just return everything inside the object in a single call, setting the * cursor to zero to signal the end of the iteration. */ - /* Handle the case of a hash table. */ - ht = NULL; + /* Handle the case of kvstore, dict or hashtable. */ + hashtable *ht = NULL; + int shallow_copied_list_items = 0; if (o == NULL) { - ht = NULL; - } else if (o->type == OBJ_SET && o->encoding == OBJ_ENCODING_HT) { + shallow_copied_list_items = 1; + } else if (o->type == OBJ_SET && o->encoding == OBJ_ENCODING_HASHTABLE) { ht = o->ptr; - } else if (o->type == OBJ_HASH && o->encoding == OBJ_ENCODING_HT) { + shallow_copied_list_items = 1; + } else if (o->type == OBJ_HASH && o->encoding == OBJ_ENCODING_HASHTABLE) { ht = o->ptr; + shallow_copied_list_items = 1; } else if (o->type == OBJ_ZSET && o->encoding == OBJ_ENCODING_SKIPLIST) { zset *zs = o->ptr; - ht = zs->dict; + ht = zs->ht; + /* scanning ZSET allocates temporary strings even though it's a dict */ + shallow_copied_list_items = 0; } list *keys = listCreate(); - /* Set a free callback for the contents of the collected keys list. 
- * For the main keyspace dict, and when we scan a key that's dict encoded - * (we have 'ht'), we don't need to define free method because the strings - * in the list are just a shallow copy from the pointer in the dictEntry. - * When scanning a key with other encodings (e.g. listpack), we need to - * free the temporary strings we add to that list. - * The exception to the above is ZSET, where we do allocate temporary - * strings even when scanning a dict. */ - if (o && (!ht || o->type == OBJ_ZSET)) { - listSetFreeMethod(keys, (void (*)(void *))sdsfree); - } - - /* For main dictionary scan or data structure using hashtable. */ + /* Set a free callback for the contents of the collected keys list if they + * are deep copied temporary strings. We must not free them if they are just + * a shallow copy - a pointer to the actual data in the data structure */ + if (!shallow_copied_list_items) { + listSetFreeMethod(keys, sdsfreeVoid); + } + + /* For main hash table scan or scannable data structure. */ if (!o || ht) { /* We set the max number of iterations to ten times the specified * COUNT, so if the hash table is in a pathological state (very @@ -1188,7 +1199,7 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) { /* We pass scanData which have three pointers to the callback: * 1. data.keys: the list to which it will add new elements; - * 2. data.o: the object containing the dictionary so that + * 2. data.o: the object containing the hash table so that * it is possible to fetch more data in a type-dependent way; * 3. 
data.type: the specified type scan in the db, LLONG_MAX means * type matching is no needed; @@ -1220,7 +1231,7 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) { if (o == NULL) { cursor = kvstoreScan(c->db->keys, cursor, onlydidx, keysScanCallback, NULL, &data); } else { - cursor = dictScan(ht, cursor, scanCallback, &data); + cursor = hashtableScan(ht, cursor, hashtableScanCallback, &data); } } while (cursor && maxiterations-- && data.sampled < count); } else if (o->type == OBJ_SET) { @@ -1839,7 +1850,8 @@ void deleteExpiredKeyFromOverwriteAndPropagate(client *c, robj *keyobj) { robj *aux = server.lazyfree_lazy_expire ? shared.unlink : shared.del; rewriteClientCommandVector(c, 2, aux, keyobj); signalModifiedKey(c, c->db, keyobj); - notifyKeyspaceEvent(NOTIFY_GENERIC, "del", keyobj, c->db->id); + notifyKeyspaceEvent(NOTIFY_EXPIRED, "expired", keyobj, c->db->id); + server.stat_expiredkeys++; } /* Propagate an implicit key deletion into replicas and the AOF file. diff --git a/src/debug.c b/src/debug.c index d63d12f762..b7f8df04fa 100644 --- a/src/debug.c +++ b/src/debug.c @@ -38,6 +38,7 @@ #include "threads_mngr.h" #include "io_threads.h" #include "sds.h" +#include "module.h" #include #include @@ -205,20 +206,20 @@ void xorObjectDigest(serverDb *db, robj *keyobj, unsigned char *digest, robj *o) } } else if (o->encoding == OBJ_ENCODING_SKIPLIST) { zset *zs = o->ptr; - dictIterator *di = dictGetIterator(zs->dict); - dictEntry *de; + hashtableIterator iter; + hashtableInitIterator(&iter, zs->ht, 0); - while ((de = dictNext(di)) != NULL) { - sds sdsele = dictGetKey(de); - double *score = dictGetVal(de); - const int len = fpconv_dtoa(*score, buf); + void *next; + while (hashtableNext(&iter, &next)) { + zskiplistNode *node = next; + const int len = fpconv_dtoa(node->score, buf); buf[len] = '\0'; memset(eledigest, 0, 20); - mixDigest(eledigest, sdsele, sdslen(sdsele)); + mixDigest(eledigest, node->ele, sdslen(node->ele)); mixDigest(eledigest, buf, 
strlen(buf)); xorDigest(digest, eledigest, 20); } - dictReleaseIterator(di); + hashtableResetIterator(&iter); } else { serverPanic("Unknown sorted set encoding"); } @@ -230,7 +231,7 @@ void xorObjectDigest(serverDb *db, robj *keyobj, unsigned char *digest, robj *o) sds sdsele; memset(eledigest, 0, 20); - sdsele = hashTypeCurrentObjectNewSds(&hi, OBJ_HASH_KEY); + sdsele = hashTypeCurrentObjectNewSds(&hi, OBJ_HASH_FIELD); mixDigest(eledigest, sdsele, sdslen(sdsele)); sdsfree(sdsele); sdsele = hashTypeCurrentObjectNewSds(&hi, OBJ_HASH_VALUE); @@ -263,7 +264,7 @@ void xorObjectDigest(serverDb *db, robj *keyobj, unsigned char *digest, robj *o) ValkeyModuleDigest md = {{0}, {0}, keyobj, db->id}; moduleValue *mv = o->ptr; moduleType *mt = mv->type; - moduleInitDigestContext(md); + moduleInitDigestContext(&md); if (mt->digest) { mt->digest(&md, mv->value); xorDigest(digest, md.x, sizeof(md.x)); @@ -283,23 +284,23 @@ void xorObjectDigest(serverDb *db, robj *keyobj, unsigned char *digest, robj *o) * a different digest. 
*/ void computeDatasetDigest(unsigned char *final) { unsigned char digest[20]; - robj *o; - int j; uint32_t aux; memset(final, 0, 20); /* Start with a clean result */ - for (j = 0; j < server.dbnum; j++) { + for (int j = 0; j < server.dbnum; j++) { serverDb *db = server.db + j; if (kvstoreSize(db->keys) == 0) continue; - kvstoreIterator *kvs_it = kvstoreIteratorInit(db->keys); + kvstoreIterator *kvs_it = kvstoreIteratorInit(db->keys, HASHTABLE_ITER_SAFE | HASHTABLE_ITER_PREFETCH_VALUES); /* hash the DB id, so the same dataset moved in a different DB will lead to a different digest */ aux = htonl(j); mixDigest(final, &aux, sizeof(aux)); /* Iterate this DB writing every entry */ - while (kvstoreIteratorNext(kvs_it, (void **)&o)) { + void *next; + while (kvstoreIteratorNext(kvs_it, &next)) { + robj *o = next; sds key; robj *keyobj; @@ -916,30 +917,29 @@ void debugCommand(client *c) { addReplyVerbatim(c, stats, sdslen(stats), "txt"); sdsfree(stats); } else if (!strcasecmp(c->argv[1]->ptr, "htstats-key") && c->argc >= 3) { - robj *o; - dict *ht = NULL; int full = 0; - if (c->argc >= 4 && !strcasecmp(c->argv[3]->ptr, "full")) full = 1; - if ((o = objectCommandLookupOrReply(c, c->argv[2], shared.nokeyerr)) == NULL) return; + robj *o = objectCommandLookupOrReply(c, c->argv[2], shared.nokeyerr); + if (o == NULL) return; - /* Get the hash table reference from the object, if possible. */ + /* Get the hashtable reference from the object, if possible. 
*/ + hashtable *ht = NULL; switch (o->encoding) { case OBJ_ENCODING_SKIPLIST: { zset *zs = o->ptr; - ht = zs->dict; + ht = zs->ht; } break; - case OBJ_ENCODING_HT: ht = o->ptr; break; + case OBJ_ENCODING_HASHTABLE: ht = o->ptr; break; } - if (ht == NULL) { - addReplyError(c, "The value stored at the specified key is not " - "represented using an hash table"); - } else { + if (ht != NULL) { char buf[4096]; - dictGetStats(buf, sizeof(buf), ht, full); + hashtableGetStats(buf, sizeof(buf), ht, full); addReplyVerbatim(c, buf, strlen(buf), "txt"); + } else { + addReplyError(c, "The value stored at the specified key is not " + "represented using an hash table"); } } else if (!strcasecmp(c->argv[1]->ptr, "change-repl-id") && c->argc == 2) { serverLog(LL_NOTICE, "Changing replication IDs after receiving DEBUG change-repl-id"); diff --git a/src/defrag.c b/src/defrag.c index 057fdd50de..fb98da96c7 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -34,6 +34,9 @@ */ #include "server.h" +#include "hashtable.h" +#include "script.h" +#include "module.h" #include #ifdef HAVE_DEFRAG @@ -84,7 +87,7 @@ struct DefragContext { long long timeproc_id; // Eventloop ID of the timerproc (or AE_DELETED_EVENT_ID) monotime timeproc_end_time; // Ending time of previous timerproc execution - long timeproc_overage_us; // A correction value if over/under target CPU percent + long timeproc_overage_us; // A correction value if over target CPU percent }; static struct DefragContext defrag; @@ -120,7 +123,7 @@ typedef doneStatus (*kvstoreHelperPreContinueFn)(monotime endtime, void *privdat // Private data for main dictionary keys typedef struct { kvstoreIterState kvstate; - serverDb *db; + int dbid; } defragKeysCtx; static_assert(offsetof(defragKeysCtx, kvstate) == 0, "defragStageKvstoreHelper requires this"); @@ -147,11 +150,6 @@ static_assert(offsetof(defragPubSubCtx, kvstate) == 0, "defragStageKvstoreHelper static list *defrag_later; static unsigned long defrag_later_cursor; - -/* this method was 
added to jemalloc in order to help us understand which - * pointers are worthwhile moving and which aren't */ -int je_get_defrag_hint(void *ptr); - /* Defrag function which allocates and copies memory if needed, but DOESN'T free the old block. * It is the responsibility of the caller to free the old block if a non-NULL value (new block) * is returned. (Returns NULL if no relocation was needed.) @@ -299,55 +297,45 @@ static void zslUpdateNode(zskiplist *zsl, zskiplistNode *oldnode, zskiplistNode } } -/* Defrag helper for sorted set. - * Update the robj pointer, defrag the skiplist struct and return the new score - * reference. We may not access oldele pointer (not even the pointer stored in - * the skiplist), as it was already freed. Newele may be null, in which case we - * only need to defrag the skiplist, but not update the obj pointer. - * When return value is non-NULL, it is the score reference that must be updated - * in the dict record. */ -static double *zslDefrag(zskiplist *zsl, double score, sds oldele, sds newele) { - zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x, *newx; - int i; - sds ele = newele ? newele : oldele; - - /* find the skiplist node referring to the object that was moved, - * and all pointers that need to be updated if we'll end up moving the skiplist node. */ - x = zsl->header; - for (i = zsl->level - 1; i >= 0; i--) { - while (x->level[i].forward && x->level[i].forward->ele != oldele && /* make sure not to access the - ->obj pointer if it matches - oldele */ - (x->level[i].forward->score < score || - (x->level[i].forward->score == score && sdscmp(x->level[i].forward->ele, ele) < 0))) - x = x->level[i].forward; +/* Hashtable scan callback for sorted set. It defragments a single skiplist + * node, updates skiplist pointers, and updates the hashtable pointer to the + * node. 
*/ +static void activeDefragZsetNode(void *privdata, void *entry_ref) { + zskiplist *zsl = privdata; + zskiplistNode **node_ref = (zskiplistNode **)entry_ref; + zskiplistNode *node = *node_ref; + + /* defragment node internals */ + sds newsds = activeDefragSds(node->ele); + if (newsds) node->ele = newsds; + + const double score = node->score; + const sds ele = node->ele; + + /* find skiplist pointers that need to be updated if we end up moving the + * skiplist node. */ + zskiplistNode *update[ZSKIPLIST_MAXLEVEL]; + zskiplistNode *x = zsl->header; + for (int i = zsl->level - 1; i >= 0; i--) { + /* stop when we've reached the end of this level or the next node comes + * after our target in sorted order */ + zskiplistNode *next = x->level[i].forward; + while (next && + (next->score < score || + (next->score == score && sdscmp(next->ele, ele) < 0))) { + x = next; + next = x->level[i].forward; + } update[i] = x; } - - /* update the robj pointer inside the skip list record. */ - x = x->level[0].forward; - serverAssert(x && score == x->score && x->ele == oldele); - if (newele) x->ele = newele; + /* should have arrived at intended node */ + serverAssert(x->level[0].forward == node); /* try to defrag the skiplist record itself */ - newx = activeDefragAlloc(x); - if (newx) { - zslUpdateNode(zsl, x, newx, update); - return &newx->score; - } - return NULL; -} - -/* Defrag helper for sorted set. 
- * Defrag a single dict entry key name, and corresponding skiplist struct */ -static void activeDefragZsetEntry(zset *zs, dictEntry *de) { - sds newsds; - double *newscore; - sds sdsele = dictGetKey(de); - if ((newsds = activeDefragSds(sdsele))) dictSetKey(zs->dict, de, newsds); - newscore = zslDefrag(zs->zsl, *(double *)dictGetVal(de), sdsele, newsds); - if (newscore) { - dictSetVal(zs->dict, de, newscore); + zskiplistNode *newnode = activeDefragAlloc(node); + if (newnode) { + zslUpdateNode(zsl, node, newnode, update); + *node_ref = newnode; /* update hashtable pointer */ } } @@ -378,6 +366,13 @@ static void activeDefragSdsDict(dict *d, int val_type) { } while (cursor != 0); } +void activeDefragSdsHashtableCallback(void *privdata, void *entry_ref) { + UNUSED(privdata); + sds *sds_ref = (sds *)entry_ref; + sds new_sds = activeDefragSds(*sds_ref); + if (new_sds != NULL) *sds_ref = new_sds; +} + /* Defrag a list of ptr, sds or robj string values */ static void activeDefragQuickListNode(quicklist *ql, quicklistNode **node_ref) { quicklistNode *newnode, *node = *node_ref; @@ -410,7 +405,7 @@ static void activeDefragQuickListNodes(quicklist *ql) { static void defragLater(robj *obj) { if (!defrag_later) { defrag_later = listCreate(); - listSetFreeMethod(defrag_later, (void (*)(void *))sdsfree); + listSetFreeMethod(defrag_later, sdsfreeVoid); defrag_later_cursor = 0; } sds key = sdsdup(objectGetKey(obj)); @@ -460,24 +455,15 @@ static long scanLaterList(robj *ob, unsigned long *cursor, monotime endtime) { return bookmark_failed ? 
1 : 0; } -typedef struct { - zset *zs; -} scanLaterZsetData; - -static void scanLaterZsetCallback(void *privdata, const dictEntry *_de) { - dictEntry *de = (dictEntry *)_de; - scanLaterZsetData *data = privdata; - activeDefragZsetEntry(data->zs, de); +static void scanLaterZsetCallback(void *privdata, void *element_ref) { + activeDefragZsetNode(privdata, element_ref); server.stat_active_defrag_scanned++; } static void scanLaterZset(robj *ob, unsigned long *cursor) { if (ob->type != OBJ_ZSET || ob->encoding != OBJ_ENCODING_SKIPLIST) return; zset *zs = (zset *)ob->ptr; - dict *d = zs->dict; - scanLaterZsetData data = {zs}; - dictDefragFunctions defragfns = {.defragAlloc = activeDefragAlloc}; - *cursor = dictScanDefrag(d, *cursor, scanLaterZsetCallback, &defragfns, &data); + *cursor = hashtableScanDefrag(zs->ht, *cursor, scanLaterZsetCallback, zs->zsl, activeDefragAlloc, HASHTABLE_SCAN_EMIT_REF); } /* Used as hashtable scan callback when all we need is to defrag the hashtable @@ -488,28 +474,25 @@ static void scanHashtableCallbackCountScanned(void *privdata, void *elemref) { server.stat_active_defrag_scanned++; } -/* Used as dict scan callback when all the work is done in the dictDefragFunctions. 
*/ -static void scanCallbackCountScanned(void *privdata, const dictEntry *de) { - UNUSED(privdata); - UNUSED(de); - server.stat_active_defrag_scanned++; +static void scanLaterSet(robj *ob, unsigned long *cursor) { + if (ob->type != OBJ_SET || ob->encoding != OBJ_ENCODING_HASHTABLE) return; + hashtable *ht = ob->ptr; + *cursor = hashtableScanDefrag(ht, *cursor, activeDefragSdsHashtableCallback, NULL, activeDefragAlloc, HASHTABLE_SCAN_EMIT_REF); } -static void scanLaterSet(robj *ob, unsigned long *cursor) { - if (ob->type != OBJ_SET || ob->encoding != OBJ_ENCODING_HT) return; - dict *d = ob->ptr; - dictDefragFunctions defragfns = {.defragAlloc = activeDefragAlloc, - .defragKey = (dictDefragAllocFunction *)activeDefragSds}; - *cursor = dictScanDefrag(d, *cursor, scanCallbackCountScanned, &defragfns, NULL); +/* Hashtable scan callback for hash datatype */ +static void activeDefragHashTypeEntry(void *privdata, void *element_ref) { + UNUSED(privdata); + hashTypeEntry **entry_ref = (hashTypeEntry **)element_ref; + + hashTypeEntry *new_entry = hashTypeEntryDefrag(*entry_ref, activeDefragAlloc, activeDefragSds); + if (new_entry) *entry_ref = new_entry; } static void scanLaterHash(robj *ob, unsigned long *cursor) { - if (ob->type != OBJ_HASH || ob->encoding != OBJ_ENCODING_HT) return; - dict *d = ob->ptr; - dictDefragFunctions defragfns = {.defragAlloc = activeDefragAlloc, - .defragKey = (dictDefragAllocFunction *)activeDefragSds, - .defragVal = (dictDefragAllocFunction *)activeDefragSds}; - *cursor = dictScanDefrag(d, *cursor, scanCallbackCountScanned, &defragfns, NULL); + if (ob->type != OBJ_HASH || ob->encoding != OBJ_ENCODING_HASHTABLE) return; + hashtable *ht = ob->ptr; + *cursor = hashtableScanDefrag(ht, *cursor, activeDefragHashTypeEntry, NULL, activeDefragAlloc, HASHTABLE_SCAN_EMIT_REF); } static void defragQuicklist(robj *ob) { @@ -523,51 +506,59 @@ static void defragQuicklist(robj *ob) { } static void defragZsetSkiplist(robj *ob) { + serverAssert(ob->type == 
OBJ_ZSET && ob->encoding == OBJ_ENCODING_SKIPLIST); zset *zs = (zset *)ob->ptr; + zset *newzs; zskiplist *newzsl; - dict *newdict; - dictEntry *de; struct zskiplistNode *newheader; - serverAssert(ob->type == OBJ_ZSET && ob->encoding == OBJ_ENCODING_SKIPLIST); if ((newzs = activeDefragAlloc(zs))) ob->ptr = zs = newzs; if ((newzsl = activeDefragAlloc(zs->zsl))) zs->zsl = newzsl; if ((newheader = activeDefragAlloc(zs->zsl->header))) zs->zsl->header = newheader; - if (dictSize(zs->dict) > server.active_defrag_max_scan_fields) + + hashtable *newtable; + if ((newtable = hashtableDefragTables(zs->ht, activeDefragAlloc))) zs->ht = newtable; + + if (hashtableSize(zs->ht) > server.active_defrag_max_scan_fields) defragLater(ob); else { - dictIterator *di = dictGetIterator(zs->dict); - while ((de = dictNext(di)) != NULL) { - activeDefragZsetEntry(zs, de); - } - dictReleaseIterator(di); + unsigned long cursor = 0; + do { + cursor = hashtableScanDefrag(zs->ht, cursor, activeDefragZsetNode, zs->zsl, activeDefragAlloc, HASHTABLE_SCAN_EMIT_REF); + } while (cursor != 0); } - /* defrag the dict struct and tables */ - if ((newdict = dictDefragTables(zs->dict))) zs->dict = newdict; } static void defragHash(robj *ob) { - dict *d, *newd; - serverAssert(ob->type == OBJ_HASH && ob->encoding == OBJ_ENCODING_HT); - d = ob->ptr; - if (dictSize(d) > server.active_defrag_max_scan_fields) + serverAssert(ob->type == OBJ_HASH && ob->encoding == OBJ_ENCODING_HASHTABLE); + hashtable *ht = ob->ptr; + if (hashtableSize(ht) > server.active_defrag_max_scan_fields) { defragLater(ob); - else - activeDefragSdsDict(d, DEFRAG_SDS_DICT_VAL_IS_SDS); - /* defrag the dict struct and tables */ - if ((newd = dictDefragTables(ob->ptr))) ob->ptr = newd; + } else { + unsigned long cursor = 0; + do { + cursor = hashtableScanDefrag(ht, cursor, activeDefragHashTypeEntry, NULL, activeDefragAlloc, HASHTABLE_SCAN_EMIT_REF); + } while (cursor != 0); + } + /* defrag the hashtable struct and tables */ + hashtable 
*new_hashtable = hashtableDefragTables(ht, activeDefragAlloc); + if (new_hashtable) ob->ptr = new_hashtable; } static void defragSet(robj *ob) { - dict *d, *newd; - serverAssert(ob->type == OBJ_SET && ob->encoding == OBJ_ENCODING_HT); - d = ob->ptr; - if (dictSize(d) > server.active_defrag_max_scan_fields) + serverAssert(ob->type == OBJ_SET && ob->encoding == OBJ_ENCODING_HASHTABLE); + hashtable *ht = ob->ptr; + if (hashtableSize(ht) > server.active_defrag_max_scan_fields) { defragLater(ob); - else - activeDefragSdsDict(d, DEFRAG_SDS_DICT_NO_VAL); - /* defrag the dict struct and tables */ - if ((newd = dictDefragTables(ob->ptr))) ob->ptr = newd; + } else { + unsigned long cursor = 0; + do { + cursor = hashtableScanDefrag(ht, cursor, activeDefragSdsHashtableCallback, NULL, activeDefragAlloc, HASHTABLE_SCAN_EMIT_REF); + } while (cursor != 0); + } + /* defrag the hashtable struct and tables */ + hashtable *new_hashtable = hashtableDefragTables(ht, activeDefragAlloc); + if (new_hashtable) ob->ptr = new_hashtable; } /* Defrag callback for radix tree iterator, called for each node, @@ -735,7 +726,7 @@ static void defragModule(serverDb *db, robj *obj) { /* for each key we scan in the main dict, this function will attempt to defrag * all the various pointers it has. 
*/ static void defragKey(defragKeysCtx *ctx, robj **elemref) { - serverDb *db = ctx->db; + serverDb *db = &server.db[ctx->dbid]; int slot = ctx->kvstate.slot; robj *newob, *ob; unsigned char *newzl; @@ -765,7 +756,7 @@ static void defragKey(defragKeysCtx *ctx, robj **elemref) { serverPanic("Unknown list encoding"); } } else if (ob->type == OBJ_SET) { - if (ob->encoding == OBJ_ENCODING_HT) { + if (ob->encoding == OBJ_ENCODING_HASHTABLE) { defragSet(ob); } else if (ob->encoding == OBJ_ENCODING_INTSET || ob->encoding == OBJ_ENCODING_LISTPACK) { void *newptr, *ptr = ob->ptr; @@ -784,7 +775,7 @@ static void defragKey(defragKeysCtx *ctx, robj **elemref) { } else if (ob->type == OBJ_HASH) { if (ob->encoding == OBJ_ENCODING_LISTPACK) { if ((newzl = activeDefragAlloc(ob->ptr))) ob->ptr = newzl; - } else if (ob->encoding == OBJ_ENCODING_HT) { + } else if (ob->encoding == OBJ_ENCODING_HASHTABLE) { defragHash(ob); } else { serverPanic("Unknown hash encoding"); @@ -809,29 +800,6 @@ static void dbKeysScanCallback(void *privdata, void *elemref) { server.stat_active_defrag_scanned++; } -/* Utility function to get the fragmentation ratio from jemalloc. - * It is critical to do that by comparing only heap maps that belong to - * jemalloc, and skip ones the jemalloc keeps as spare. Since we use this - * fragmentation ratio in order to decide if a defrag action should be taken - * or not, a false detection can cause the defragmenter to waste a lot of CPU - * without the possibility of getting any results. */ -static float getAllocatorFragmentation(size_t *out_frag_bytes) { - size_t resident, active, allocated, frag_smallbins_bytes; - zmalloc_get_allocator_info(&allocated, &active, &resident, NULL, NULL); - frag_smallbins_bytes = allocatorDefragGetFragSmallbins(); - /* Calculate the fragmentation ratio as the proportion of wasted memory in small - * bins (which are defraggable) relative to the total allocated memory (including large bins). 
- * This is because otherwise, if most of the memory usage is large bins, we may show high percentage, - * despite the fact it's not a lot of memory for the user. */ - float frag_pct = (float)frag_smallbins_bytes / allocated * 100; - float rss_pct = ((float)resident / allocated) * 100 - 100; - size_t rss_bytes = resident - allocated; - if (out_frag_bytes) *out_frag_bytes = frag_smallbins_bytes; - serverLog(LL_DEBUG, "allocated=%zu, active=%zu, resident=%zu, frag=%.2f%% (%.2f%% rss), frag_bytes=%zu (%zu rss)", - allocated, active, resident, frag_pct, rss_pct, frag_smallbins_bytes, rss_bytes); - return frag_pct; -} - /* Defrag scan callback for a pubsub channels hashtable. */ static void defragPubsubScanCallback(void *privdata, void *elemref) { defragPubSubCtx *ctx = privdata; @@ -919,7 +887,7 @@ static doneStatus defragLaterStep(monotime endtime, void *privdata) { robj *ob = found; long long key_defragged = server.stat_active_defrag_hits; - bool timeout = (defragLaterItem(ob, &defrag_later_cursor, endtime, ctx->db->id) == 1); + bool timeout = (defragLaterItem(ob, &defrag_later_cursor, endtime, ctx->dbid) == 1); if (key_defragged != server.stat_active_defrag_hits) { server.stat_active_defrag_key_hits++; } else { @@ -962,7 +930,10 @@ static doneStatus defragStageKvstoreHelper(monotime endtime, state.cursor = 0; return DEFRAG_NOT_DONE; } - serverAssert(kvs == state.kvs); // Shouldn't change during the stage + if (kvs != state.kvs) { + // There has been a change of the kvs (flushdb, swapdb, etc.). Just complete the stage. 
+ return DEFRAG_DONE; + } unsigned int iterations = 0; unsigned long long prev_defragged = server.stat_active_defrag_hits; @@ -1012,26 +983,30 @@ static doneStatus defragStageKvstoreHelper(monotime endtime, } -// Note: target is a DB, (not a KVS like most stages) +// Target is a DBID static doneStatus defragStageDbKeys(monotime endtime, void *target, void *privdata) { UNUSED(privdata); - serverDb *db = (serverDb *)target; + int dbid = (uintptr_t)target; + serverDb *db = &server.db[dbid]; static defragKeysCtx ctx; // STATIC - this persists if (endtime == 0) { - ctx.db = db; + ctx.dbid = dbid; // Don't return yet. Call the helper with endtime==0 below. } - serverAssert(ctx.db == db); + serverAssert(ctx.dbid == dbid); return defragStageKvstoreHelper(endtime, db->keys, dbKeysScanCallback, defragLaterStep, &ctx); } +// Target is a DBID static doneStatus defragStageExpiresKvstore(monotime endtime, void *target, void *privdata) { UNUSED(privdata); - return defragStageKvstoreHelper(endtime, (kvstore *)target, + int dbid = (uintptr_t)target; + serverDb *db = &server.db[dbid]; + return defragStageKvstoreHelper(endtime, db->expires, scanHashtableCallbackCountScanned, NULL, NULL); } @@ -1050,6 +1025,9 @@ static doneStatus defragLuaScripts(monotime endtime, void *target, void *privdat UNUSED(target); UNUSED(privdata); if (endtime == 0) return DEFRAG_NOT_DONE; // required initialization + /* In case we are in the process of eval some script we do not want to replace the script being run + * so we just bail out without really defragging here. 
*/ + if (scriptIsRunning()) return DEFRAG_DONE; activeDefragSdsDict(evalScriptsDict(), DEFRAG_SDS_DICT_VAL_LUA_SCRIPT); return DEFRAG_DONE; } @@ -1115,6 +1093,9 @@ static void endDefragCycle(bool normal_termination) { server.stat_total_active_defrag_time += elapsedUs(server.stat_last_active_defrag_time); server.stat_last_active_defrag_time = 0; server.active_defrag_cpu_percent = 0; + + /* Immediately check to see if we should start another defrag cycle. */ + monitorActiveDefrag(); } @@ -1157,7 +1138,7 @@ static int computeDefragCycleUs(void) { * the starvation of the timer. */ dutyCycleUs = targetCpuPercent * waitedUs / (100 - targetCpuPercent); - // Also adjust for any accumulated overage(underage). + // Also adjust for any accumulated overage. dutyCycleUs -= defrag.timeproc_overage_us; defrag.timeproc_overage_us = 0; @@ -1176,8 +1157,11 @@ static int computeDefragCycleUs(void) { * computeDefragCycleUs computation. */ static int computeDelayMs(monotime intendedEndtime) { defrag.timeproc_end_time = getMonotonicUs(); - int overage = defrag.timeproc_end_time - intendedEndtime; + long overage = defrag.timeproc_end_time - intendedEndtime; defrag.timeproc_overage_us += overage; // track over/under desired CPU + /* Allow negative overage (underage) to count against existing overage, but don't allow + * underage (from short stages) to be accumulated. 
*/ + if (defrag.timeproc_overage_us < 0) defrag.timeproc_overage_us = 0; int targetCpuPercent = server.active_defrag_cpu_percent; serverAssert(targetCpuPercent > 0 && targetCpuPercent < 100); @@ -1189,7 +1173,7 @@ static int computeDelayMs(monotime intendedEndtime) { long totalCycleTimeUs = server.active_defrag_cycle_us * 100 / targetCpuPercent; long delayUs = totalCycleTimeUs - server.active_defrag_cycle_us; // Only increase delay by the fraction of the overage that would be non-duty-cycle - delayUs += defrag.timeproc_overage_us * (100 - targetCpuPercent) / 100; // "overage" might be negative + delayUs += defrag.timeproc_overage_us * (100 - targetCpuPercent) / 100; if (delayUs < 0) delayUs = 0; long delayMs = delayUs / 1000; // round down return delayMs; @@ -1219,29 +1203,38 @@ static long long activeDefragTimeProc(struct aeEventLoop *eventLoop, long long i } monotime starttime = getMonotonicUs(); - monotime endtime = starttime + computeDefragCycleUs(); + int dutyCycleUs = computeDefragCycleUs(); + monotime endtime = starttime + dutyCycleUs; + bool haveMoreWork = true; mstime_t latency; latencyStartMonitor(latency); - if (!defrag.current_stage) { - defrag.current_stage = listNodeValue(listFirst(defrag.remaining_stages)); - listDelNode(defrag.remaining_stages, listFirst(defrag.remaining_stages)); - // Initialize the stage with endtime==0 - doneStatus status = defrag.current_stage->stage_fn(0, defrag.current_stage->target, defrag.current_stage->privdata); - serverAssert(status == DEFRAG_NOT_DONE); // Initialization should always return DEFRAG_NOT_DONE - } + do { + if (!defrag.current_stage) { + defrag.current_stage = listNodeValue(listFirst(defrag.remaining_stages)); + listDelNode(defrag.remaining_stages, listFirst(defrag.remaining_stages)); + // Initialize the stage with endtime==0 + doneStatus status = defrag.current_stage->stage_fn(0, defrag.current_stage->target, defrag.current_stage->privdata); + serverAssert(status == DEFRAG_NOT_DONE); // Initialization should 
always return DEFRAG_NOT_DONE + } - doneStatus status = defrag.current_stage->stage_fn(endtime, defrag.current_stage->target, defrag.current_stage->privdata); - if (status == DEFRAG_DONE) { - zfree(defrag.current_stage); - defrag.current_stage = NULL; - } + doneStatus status = defrag.current_stage->stage_fn(endtime, defrag.current_stage->target, defrag.current_stage->privdata); + if (status == DEFRAG_DONE) { + zfree(defrag.current_stage); + defrag.current_stage = NULL; + } + + haveMoreWork = (defrag.current_stage || listLength(defrag.remaining_stages) > 0); + /* If we've completed a stage early, and still have a standard time allotment remaining, + * we'll start another stage. This can happen when defrag is running infrequently, and + * starvation protection has increased the duty-cycle. */ + } while (haveMoreWork && getMonotonicUs() <= endtime - server.active_defrag_cycle_us); latencyEndMonitor(latency); latencyAddSampleIfNeeded("active-defrag-cycle", latency); - if (defrag.current_stage || listLength(defrag.remaining_stages) > 0) { + if (haveMoreWork) { return computeDelayMs(endtime); } else { endDefragCycle(true); @@ -1254,6 +1247,9 @@ static long long activeDefragTimeProc(struct aeEventLoop *eventLoop, long long i * actions. This interface allows defrag to continue running, avoiding a single long defrag step * after the long operation completes. */ void defragWhileBlocked(void) { + // This is called infrequently, while timers are not active. We might need to start defrag. + if (!defragIsRunning()) monitorActiveDefrag(); + if (!defragIsRunning()) return; // Save off the timeproc_id. If we have a normal termination, it will be cleared. 
@@ -1277,9 +1273,8 @@ static void beginDefragCycle(void) { defrag.remaining_stages = listCreate(); for (int dbid = 0; dbid < server.dbnum; dbid++) { - serverDb *db = &server.db[dbid]; - addDefragStage(defragStageDbKeys, db, NULL); - addDefragStage(defragStageExpiresKvstore, db->expires, NULL); + addDefragStage(defragStageDbKeys, (void *)(uintptr_t)dbid, NULL); + addDefragStage(defragStageExpiresKvstore, (void *)(uintptr_t)dbid, NULL); } static getClientChannelsFnWrapper getClientPubSubChannelsFn = {getClientPubSubChannels}; diff --git a/src/eval.c b/src/eval.c index a9c50cdf90..62780447a9 100644 --- a/src/eval.c +++ b/src/eval.c @@ -204,7 +204,7 @@ void scriptingInit(int setup) { * and we need to free them respectively. */ lctx.lua_scripts = dictCreate(&shaScriptObjectDictType); lctx.lua_scripts_lru_list = listCreate(); - listSetFreeMethod(lctx.lua_scripts_lru_list, (void (*)(void *))sdsfree); + listSetFreeMethod(lctx.lua_scripts_lru_list, sdsfreeVoid); lctx.lua_scripts_mem = 0; luaRegisterServerAPI(lua); @@ -285,6 +285,7 @@ void scriptingInit(int setup) { void freeLuaScriptsSync(dict *lua_scripts, list *lua_scripts_lru_list, lua_State *lua) { dictRelease(lua_scripts); listRelease(lua_scripts_lru_list); + lua_gc(lua, LUA_GCCOLLECT, 0); lua_close(lua); #if !defined(USE_LIBC) @@ -777,7 +778,7 @@ void ldbInit(void) { ldb.conn = NULL; ldb.active = 0; ldb.logs = listCreate(); - listSetFreeMethod(ldb.logs, (void (*)(void *))sdsfree); + listSetFreeMethod(ldb.logs, sdsfreeVoid); ldb.children = listCreate(); ldb.src = NULL; ldb.lines = 0; diff --git a/src/evict.c b/src/evict.c index eecd000a4b..d4bfade4fc 100644 --- a/src/evict.c +++ b/src/evict.c @@ -642,9 +642,9 @@ int performEvictions(void) { kvs = db->expires; } int slot = kvstoreGetFairRandomHashtableIndex(kvs); - int found = kvstoreHashtableRandomEntry(kvs, slot, (void **)&valkey); - if (found) { - bestkey = objectGetKey(valkey); + void *entry; + if (kvstoreHashtableRandomEntry(kvs, slot, &entry)) { + bestkey = 
objectGetKey((robj *)entry); bestdbid = j; break; } diff --git a/src/function_lua.c b/src/function_lua.c index fa9983bf7e..59c16eae54 100644 --- a/src/function_lua.c +++ b/src/function_lua.c @@ -39,6 +39,7 @@ * Uses script_lua.c to run the Lua code. */ +#include "scripting_engine.h" #include "functions.h" #include "script_lua.h" #include @@ -64,17 +65,14 @@ typedef struct luaFunctionCtx { } luaFunctionCtx; typedef struct loadCtx { - functionLibInfo *li; + list *functions; monotime start_time; size_t timeout; } loadCtx; -typedef struct registerFunctionArgs { - sds name; - sds desc; - luaFunctionCtx *lua_f_ctx; - uint64_t f_flags; -} registerFunctionArgs; +static void luaEngineFreeFunction(ValkeyModuleCtx *module_ctx, + engineCtx *engine_ctx, + void *compiled_function); /* Hook for FUNCTION LOAD execution. * Used to cancel the execution in case of a timeout (500ms). @@ -93,15 +91,42 @@ static void luaEngineLoadHook(lua_State *lua, lua_Debug *ar) { } } +static void freeCompiledFunc(ValkeyModuleCtx *module_ctx, + luaEngineCtx *lua_engine_ctx, + void *compiled_func) { + /* The lua engine is implemented in the core, and not in a Valkey Module */ + serverAssert(module_ctx == NULL); + + compiledFunction *func = compiled_func; + decrRefCount(func->name); + if (func->desc) { + decrRefCount(func->desc); + } + luaEngineFreeFunction(module_ctx, lua_engine_ctx, func->function); + zfree(func); +} + /* - * Compile a given blob and save it on the registry. - * Return a function ctx with Lua ref that allows to later retrieve the - * function from the registry. + * Compile a given script code by generating a set of compiled functions. These + * functions are also saved into the the registry of the Lua environment. + * + * Returns an array of compiled functions. The `compileFunction` struct stores a + * Lua ref that allows to later retrieve the function from the registry. + * In the `out_num_compiled_functions` parameter is returned the size of the + * array. 
* * Return NULL on compilation error and set the error to the err variable */ -static int luaEngineCreate(void *engine_ctx, functionLibInfo *li, sds blob, size_t timeout, sds *err) { - int ret = C_ERR; +static compiledFunction **luaEngineCreate(ValkeyModuleCtx *module_ctx, + engineCtx *engine_ctx, + const char *code, + size_t timeout, + size_t *out_num_compiled_functions, + robj **err) { + /* The lua engine is implemented in the core, and not in a Valkey Module */ + serverAssert(module_ctx == NULL); + + compiledFunction **compiled_functions = NULL; luaEngineCtx *lua_engine_ctx = engine_ctx; lua_State *lua = lua_engine_ctx->lua; @@ -114,15 +139,16 @@ static int luaEngineCreate(void *engine_ctx, functionLibInfo *li, sds blob, size lua_pop(lua, 1); /* pop the metatable */ /* compile the code */ - if (luaL_loadbuffer(lua, blob, sdslen(blob), "@user_function")) { - *err = sdscatprintf(sdsempty(), "Error compiling function: %s", lua_tostring(lua, -1)); + if (luaL_loadbuffer(lua, code, strlen(code), "@user_function")) { + sds error = sdscatfmt(sdsempty(), "Error compiling function: %s", lua_tostring(lua, -1)); + *err = createObject(OBJ_STRING, error); lua_pop(lua, 1); /* pops the error */ goto done; } serverAssert(lua_isfunction(lua, -1)); loadCtx load_ctx = { - .li = li, + .functions = listCreate(), .start_time = getMonotonicUs(), .timeout = timeout, }; @@ -133,13 +159,32 @@ static int luaEngineCreate(void *engine_ctx, functionLibInfo *li, sds blob, size if (lua_pcall(lua, 0, 0, 0)) { errorInfo err_info = {0}; luaExtractErrorInformation(lua, &err_info); - *err = sdscatprintf(sdsempty(), "Error registering functions: %s", err_info.msg); + sds error = sdscatfmt(sdsempty(), "Error registering functions: %s", err_info.msg); + *err = createObject(OBJ_STRING, error); lua_pop(lua, 1); /* pops the error */ luaErrorInformationDiscard(&err_info); + listIter *iter = listGetIterator(load_ctx.functions, AL_START_HEAD); + listNode *node = NULL; + while ((node = listNext(iter)) != 
NULL) { + freeCompiledFunc(module_ctx, lua_engine_ctx, listNodeValue(node)); + } + listReleaseIterator(iter); + listRelease(load_ctx.functions); goto done; } - ret = C_OK; + compiled_functions = + zcalloc(sizeof(compiledFunction *) * listLength(load_ctx.functions)); + listIter *iter = listGetIterator(load_ctx.functions, AL_START_HEAD); + listNode *node = NULL; + *out_num_compiled_functions = 0; + while ((node = listNext(iter)) != NULL) { + compiledFunction *func = listNodeValue(node); + compiled_functions[*out_num_compiled_functions] = func; + (*out_num_compiled_functions)++; + } + listReleaseIterator(iter); + listRelease(load_ctx.functions); done: /* restore original globals */ @@ -152,19 +197,23 @@ static int luaEngineCreate(void *engine_ctx, functionLibInfo *li, sds blob, size lua_sethook(lua, NULL, 0, 0); /* Disable hook */ luaSaveOnRegistry(lua, REGISTRY_LOAD_CTX_NAME, NULL); - return ret; + return compiled_functions; } /* * Invole the give function with the given keys and args */ -static void luaEngineCall(scriptRunCtx *run_ctx, - void *engine_ctx, +static void luaEngineCall(ValkeyModuleCtx *module_ctx, + engineCtx *engine_ctx, + functionCtx *func_ctx, void *compiled_function, robj **keys, size_t nkeys, robj **args, size_t nargs) { + /* The lua engine is implemented in the core, and not in a Valkey Module */ + serverAssert(module_ctx == NULL); + luaEngineCtx *lua_engine_ctx = engine_ctx; lua_State *lua = lua_engine_ctx->lua; luaFunctionCtx *f_ctx = compiled_function; @@ -177,25 +226,38 @@ static void luaEngineCall(scriptRunCtx *run_ctx, serverAssert(lua_isfunction(lua, -1)); + scriptRunCtx *run_ctx = (scriptRunCtx *)func_ctx; luaCallFunction(run_ctx, lua, keys, nkeys, args, nargs, 0); lua_pop(lua, 1); /* Pop error handler */ } -static size_t luaEngineGetUsedMemoy(void *engine_ctx) { +static engineMemoryInfo luaEngineGetMemoryInfo(ValkeyModuleCtx *module_ctx, + engineCtx *engine_ctx) { + /* The lua engine is implemented in the core, and not in a Valkey Module 
*/ + serverAssert(module_ctx == NULL); + luaEngineCtx *lua_engine_ctx = engine_ctx; - return luaMemory(lua_engine_ctx->lua); + + return (engineMemoryInfo){ + .used_memory = luaMemory(lua_engine_ctx->lua), + .engine_memory_overhead = zmalloc_size(lua_engine_ctx), + }; } -static size_t luaEngineFunctionMemoryOverhead(void *compiled_function) { +static size_t luaEngineFunctionMemoryOverhead(ValkeyModuleCtx *module_ctx, + void *compiled_function) { + /* The lua engine is implemented in the core, and not in a Valkey Module */ + serverAssert(module_ctx == NULL); + return zmalloc_size(compiled_function); } -static size_t luaEngineMemoryOverhead(void *engine_ctx) { - luaEngineCtx *lua_engine_ctx = engine_ctx; - return zmalloc_size(lua_engine_ctx); -} +static void luaEngineFreeFunction(ValkeyModuleCtx *module_ctx, + engineCtx *engine_ctx, + void *compiled_function) { + /* The lua engine is implemented in the core, and not in a Valkey Module */ + serverAssert(module_ctx == NULL); -static void luaEngineFreeFunction(void *engine_ctx, void *compiled_function) { luaEngineCtx *lua_engine_ctx = engine_ctx; lua_State *lua = lua_engine_ctx->lua; luaFunctionCtx *f_ctx = compiled_function; @@ -203,26 +265,19 @@ static void luaEngineFreeFunction(void *engine_ctx, void *compiled_function) { zfree(f_ctx); } -static void luaRegisterFunctionArgsInitialize(registerFunctionArgs *register_f_args, - sds name, - sds desc, +static void luaRegisterFunctionArgsInitialize(compiledFunction *func, + robj *name, + robj *desc, luaFunctionCtx *lua_f_ctx, uint64_t flags) { - *register_f_args = (registerFunctionArgs){ + *func = (compiledFunction){ .name = name, .desc = desc, - .lua_f_ctx = lua_f_ctx, + .function = lua_f_ctx, .f_flags = flags, }; } -static void luaRegisterFunctionArgsDispose(lua_State *lua, registerFunctionArgs *register_f_args) { - sdsfree(register_f_args->name); - if (register_f_args->desc) sdsfree(register_f_args->desc); - lua_unref(lua, register_f_args->lua_f_ctx->lua_function_ref); - 
zfree(register_f_args->lua_f_ctx); -} - /* Read function flags located on the top of the Lua stack. * On success, return C_OK and set the flags to 'flags' out parameter * Return C_ERR if encounter an unknown flag. */ @@ -267,10 +322,11 @@ static int luaRegisterFunctionReadFlags(lua_State *lua, uint64_t *flags) { return ret; } -static int luaRegisterFunctionReadNamedArgs(lua_State *lua, registerFunctionArgs *register_f_args) { +static int luaRegisterFunctionReadNamedArgs(lua_State *lua, + compiledFunction *func) { char *err = NULL; - sds name = NULL; - sds desc = NULL; + robj *name = NULL; + robj *desc = NULL; luaFunctionCtx *lua_f_ctx = NULL; uint64_t flags = 0; if (!lua_istable(lua, 1)) { @@ -287,14 +343,15 @@ static int luaRegisterFunctionReadNamedArgs(lua_State *lua, registerFunctionArgs err = "named argument key given to server.register_function is not a string"; goto error; } + const char *key = lua_tostring(lua, -2); if (!strcasecmp(key, "function_name")) { - if (!(name = luaGetStringSds(lua, -1))) { + if (!(name = luaGetStringObject(lua, -1))) { err = "function_name argument given to server.register_function must be a string"; goto error; } } else if (!strcasecmp(key, "description")) { - if (!(desc = luaGetStringSds(lua, -1))) { + if (!(desc = luaGetStringObject(lua, -1))) { err = "description argument given to server.register_function must be a string"; goto error; } @@ -335,13 +392,17 @@ static int luaRegisterFunctionReadNamedArgs(lua_State *lua, registerFunctionArgs goto error; } - luaRegisterFunctionArgsInitialize(register_f_args, name, desc, lua_f_ctx, flags); + luaRegisterFunctionArgsInitialize(func, + name, + desc, + lua_f_ctx, + flags); return C_OK; error: - if (name) sdsfree(name); - if (desc) sdsfree(desc); + if (name) decrRefCount(name); + if (desc) decrRefCount(desc); if (lua_f_ctx) { lua_unref(lua, lua_f_ctx->lua_function_ref); zfree(lua_f_ctx); @@ -350,11 +411,12 @@ static int luaRegisterFunctionReadNamedArgs(lua_State *lua, 
registerFunctionArgs return C_ERR; } -static int luaRegisterFunctionReadPositionalArgs(lua_State *lua, registerFunctionArgs *register_f_args) { +static int luaRegisterFunctionReadPositionalArgs(lua_State *lua, + compiledFunction *func) { char *err = NULL; - sds name = NULL; + robj *name = NULL; luaFunctionCtx *lua_f_ctx = NULL; - if (!(name = luaGetStringSds(lua, 1))) { + if (!(name = luaGetStringObject(lua, 1))) { err = "first argument to server.register_function must be a string"; goto error; } @@ -369,17 +431,17 @@ static int luaRegisterFunctionReadPositionalArgs(lua_State *lua, registerFunctio lua_f_ctx = zmalloc(sizeof(*lua_f_ctx)); lua_f_ctx->lua_function_ref = lua_function_ref; - luaRegisterFunctionArgsInitialize(register_f_args, name, NULL, lua_f_ctx, 0); + luaRegisterFunctionArgsInitialize(func, name, NULL, lua_f_ctx, 0); return C_OK; error: - if (name) sdsfree(name); + if (name) decrRefCount(name); luaPushError(lua, err); return C_ERR; } -static int luaRegisterFunctionReadArgs(lua_State *lua, registerFunctionArgs *register_f_args) { +static int luaRegisterFunctionReadArgs(lua_State *lua, compiledFunction *func) { int argc = lua_gettop(lua); if (argc < 1 || argc > 2) { luaPushError(lua, "wrong number of arguments to server.register_function"); @@ -387,33 +449,28 @@ static int luaRegisterFunctionReadArgs(lua_State *lua, registerFunctionArgs *reg } if (argc == 1) { - return luaRegisterFunctionReadNamedArgs(lua, register_f_args); + return luaRegisterFunctionReadNamedArgs(lua, func); } else { - return luaRegisterFunctionReadPositionalArgs(lua, register_f_args); + return luaRegisterFunctionReadPositionalArgs(lua, func); } } static int luaRegisterFunction(lua_State *lua) { - registerFunctionArgs register_f_args = {0}; + compiledFunction *func = zcalloc(sizeof(*func)); loadCtx *load_ctx = luaGetFromRegistry(lua, REGISTRY_LOAD_CTX_NAME); if (!load_ctx) { + zfree(func); luaPushError(lua, "server.register_function can only be called on FUNCTION LOAD command"); 
return luaError(lua); } - if (luaRegisterFunctionReadArgs(lua, ®ister_f_args) != C_OK) { + if (luaRegisterFunctionReadArgs(lua, func) != C_OK) { + zfree(func); return luaError(lua); } - sds err = NULL; - if (functionLibCreateFunction(register_f_args.name, register_f_args.lua_f_ctx, load_ctx->li, register_f_args.desc, - register_f_args.f_flags, &err) != C_OK) { - luaRegisterFunctionArgsDispose(lua, ®ister_f_args); - luaPushError(lua, err); - sdsfree(err); - return luaError(lua); - } + listAddNodeTail(load_ctx->functions, func); return 0; } @@ -494,16 +551,17 @@ int luaEngineInitEngine(void) { lua_enablereadonlytable(lua_engine_ctx->lua, -1, 1); /* protect the new global table */ lua_replace(lua_engine_ctx->lua, LUA_GLOBALSINDEX); /* set new global table as the new globals */ - - engine *lua_engine = zmalloc(sizeof(*lua_engine)); - *lua_engine = (engine){ - .engine_ctx = lua_engine_ctx, - .create = luaEngineCreate, - .call = luaEngineCall, - .get_used_memory = luaEngineGetUsedMemoy, + engineMethods lua_engine_methods = { + .version = VALKEYMODULE_SCRIPTING_ENGINE_ABI_VERSION, + .create_functions_library = luaEngineCreate, + .call_function = luaEngineCall, .get_function_memory_overhead = luaEngineFunctionMemoryOverhead, - .get_engine_memory_overhead = luaEngineMemoryOverhead, .free_function = luaEngineFreeFunction, + .get_memory_info = luaEngineGetMemoryInfo, }; - return functionsRegisterEngine(LUA_ENGINE_NAME, lua_engine); + + return scriptingEngineManagerRegister(LUA_ENGINE_NAME, + NULL, + lua_engine_ctx, + &lua_engine_methods); } diff --git a/src/functions.c b/src/functions.c index b694e35252..14d8c5296e 100644 --- a/src/functions.c +++ b/src/functions.c @@ -40,8 +40,6 @@ typedef enum { restorePolicy_Replace } restorePolicy; -static size_t engine_cache_memory = 0; - /* Forward declaration */ static void engineFunctionDispose(void *obj); static void engineStatsDispose(void *obj); @@ -66,15 +64,6 @@ typedef struct functionsLibMetaData { sds code; } 
functionsLibMetaData; -dictType engineDictType = { - dictSdsCaseHash, /* hash function */ - dictSdsDup, /* key dup */ - dictSdsKeyCaseCompare, /* key compare */ - dictSdsDestructor, /* key destructor */ - NULL, /* val destructor */ - NULL /* allow to expand */ -}; - dictType functionDictType = { dictSdsCaseHash, /* hash function */ dictSdsDup, /* key dup */ @@ -111,15 +100,14 @@ dictType librariesDictType = { NULL /* allow to expand */ }; -/* Dictionary of engines */ -static dict *engines = NULL; - /* Libraries Ctx. */ static functionsLibCtx *curr_functions_lib_ctx = NULL; static size_t functionMallocSize(functionInfo *fi) { - return zmalloc_size(fi) + sdsAllocSize(fi->name) + (fi->desc ? sdsAllocSize(fi->desc) : 0) + - fi->li->ei->engine->get_function_memory_overhead(fi->function); + return zmalloc_size(fi) + + sdsAllocSize(fi->name) + + (fi->desc ? sdsAllocSize(fi->desc) : 0) + + scriptingEngineCallGetFunctionMemoryOverhead(fi->li->engine, fi->function); } static size_t libraryMallocSize(functionLibInfo *li) { @@ -141,8 +129,8 @@ static void engineFunctionDispose(void *obj) { if (fi->desc) { sdsfree(fi->desc); } - engine *engine = fi->li->ei->engine; - engine->free_function(engine->engine_ctx, fi->function); + + scriptingEngineCallFreeFunction(fi->li->engine, fi->function); zfree(fi); } @@ -215,24 +203,33 @@ functionsLibCtx *functionsLibCtxGetCurrent(void) { return curr_functions_lib_ctx; } +static void initializeFunctionsLibEngineStats(scriptingEngine *engine, + void *context) { + functionsLibCtx *lib_ctx = (functionsLibCtx *)context; + functionsLibEngineStats *stats = zcalloc(sizeof(*stats)); + dictAdd(lib_ctx->engines_stats, scriptingEngineGetName(engine), stats); +} + /* Create a new functions ctx */ functionsLibCtx *functionsLibCtxCreate(void) { functionsLibCtx *ret = zmalloc(sizeof(functionsLibCtx)); ret->libraries = dictCreate(&librariesDictType); ret->functions = dictCreate(&functionDictType); ret->engines_stats = dictCreate(&engineStatsDictType); - 
dictIterator *iter = dictGetIterator(engines); - dictEntry *entry = NULL; - while ((entry = dictNext(iter))) { - engineInfo *ei = dictGetVal(entry); - functionsLibEngineStats *stats = zcalloc(sizeof(*stats)); - dictAdd(ret->engines_stats, ei->name, stats); - } - dictReleaseIterator(iter); + scriptingEngineManagerForEachEngine(initializeFunctionsLibEngineStats, ret); ret->cache_memory = 0; return ret; } +void functionsAddEngineStats(sds engine_name) { + serverAssert(curr_functions_lib_ctx != NULL); + dictEntry *entry = dictFind(curr_functions_lib_ctx->engines_stats, engine_name); + if (entry == NULL) { + functionsLibEngineStats *stats = zcalloc(sizeof(*stats)); + dictAdd(curr_functions_lib_ctx->engines_stats, engine_name, stats); + } +} + /* * Creating a function inside the given library. * On success, return C_OK. @@ -242,24 +239,34 @@ functionsLibCtx *functionsLibCtxCreate(void) { * the function will verify that the given name is following the naming format * and return an error if its not. 
*/ -int functionLibCreateFunction(sds name, void *function, functionLibInfo *li, sds desc, uint64_t f_flags, sds *err) { - if (functionsVerifyName(name) != C_OK) { - *err = sdsnew("Library names can only contain letters, numbers, or underscores(_) and must be at least one " - "character long"); +static int functionLibCreateFunction(robj *name, + void *function, + functionLibInfo *li, + robj *desc, + uint64_t f_flags, + sds *err) { + serverAssert(name->type == OBJ_STRING); + serverAssert(desc == NULL || desc->type == OBJ_STRING); + + if (functionsVerifyName(name->ptr) != C_OK) { + *err = sdsnew("Function names can only contain letters, numbers, or " + "underscores(_) and must be at least one character long"); return C_ERR; } - if (dictFetchValue(li->functions, name)) { + sds name_sds = sdsdup(name->ptr); + if (dictFetchValue(li->functions, name_sds)) { *err = sdsnew("Function already exists in the library"); + sdsfree(name_sds); return C_ERR; } functionInfo *fi = zmalloc(sizeof(*fi)); *fi = (functionInfo){ - .name = name, + .name = name_sds, .function = function, .li = li, - .desc = desc, + .desc = desc ? 
sdsdup(desc->ptr) : NULL, .f_flags = f_flags, }; @@ -269,12 +276,12 @@ int functionLibCreateFunction(sds name, void *function, functionLibInfo *li, sds return C_OK; } -static functionLibInfo *engineLibraryCreate(sds name, engineInfo *ei, sds code) { +static functionLibInfo *engineLibraryCreate(sds name, scriptingEngine *e, sds code) { functionLibInfo *li = zmalloc(sizeof(*li)); *li = (functionLibInfo){ .name = sdsdup(name), .functions = dictCreate(&libraryFunctionDictType), - .ei = ei, + .engine = e, .code = sdsdup(code), }; return li; @@ -296,7 +303,7 @@ static void libraryUnlink(functionsLibCtx *lib_ctx, functionLibInfo *li) { lib_ctx->cache_memory -= libraryMallocSize(li); /* update stats */ - functionsLibEngineStats *stats = dictFetchValue(lib_ctx->engines_stats, li->ei->name); + functionsLibEngineStats *stats = dictFetchValue(lib_ctx->engines_stats, scriptingEngineGetName(li->engine)); serverAssert(stats); stats->n_lib--; stats->n_functions -= dictSize(li->functions); @@ -316,7 +323,7 @@ static void libraryLink(functionsLibCtx *lib_ctx, functionLibInfo *li) { lib_ctx->cache_memory += libraryMallocSize(li); /* update stats */ - functionsLibEngineStats *stats = dictFetchValue(lib_ctx->engines_stats, li->ei->name); + functionsLibEngineStats *stats = dictFetchValue(lib_ctx->engines_stats, scriptingEngineGetName(li->engine)); serverAssert(stats); stats->n_lib++; stats->n_functions += dictSize(li->functions); @@ -348,7 +355,7 @@ libraryJoin(functionsLibCtx *functions_lib_ctx_dst, functionsLibCtx *functions_l } else { if (!old_libraries_list) { old_libraries_list = listCreate(); - listSetFreeMethod(old_libraries_list, (void (*)(void *))engineLibraryFree); + listSetFreeMethod(old_libraries_list, engineLibraryDispose); } libraryUnlink(functions_lib_ctx_dst, old_li); listAddNodeTail(old_libraries_list, old_li); @@ -403,35 +410,29 @@ libraryJoin(functionsLibCtx *functions_lib_ctx_dst, functionsLibCtx *functions_l return ret; } -/* Register an engine, should be called 
once by the engine on startup and give the following: - * - * - engine_name - name of the engine to register - * - engine_ctx - the engine ctx that should be used by the server to interact with the engine */ -int functionsRegisterEngine(const char *engine_name, engine *engine) { - sds engine_name_sds = sdsnew(engine_name); - if (dictFetchValue(engines, engine_name_sds)) { - serverLog(LL_WARNING, "Same engine was registered twice"); - sdsfree(engine_name_sds); - return C_ERR; - } - - client *c = createClient(NULL); - c->flag.deny_blocking = 1; - c->flag.script = 1; - c->flag.fake = 1; - engineInfo *ei = zmalloc(sizeof(*ei)); - *ei = (engineInfo){ - .name = engine_name_sds, - .engine = engine, - .c = c, - }; - - dictAdd(engines, engine_name_sds, ei); - - engine_cache_memory += zmalloc_size(ei) + sdsAllocSize(ei->name) + zmalloc_size(engine) + - engine->get_engine_memory_overhead(engine->engine_ctx); +static void replyEngineStats(scriptingEngine *engine, void *context) { + client *c = (client *)context; + addReplyBulkCString(c, scriptingEngineGetName(engine)); + addReplyMapLen(c, 2); + functionsLibEngineStats *e_stats = + dictFetchValue(curr_functions_lib_ctx->engines_stats, scriptingEngineGetName(engine)); + addReplyBulkCString(c, "libraries_count"); + addReplyLongLong(c, e_stats ? e_stats->n_lib : 0); + addReplyBulkCString(c, "functions_count"); + addReplyLongLong(c, e_stats ? 
e_stats->n_functions : 0); +} - return C_OK; +void functionsRemoveLibFromEngine(scriptingEngine *engine) { + dictIterator *iter = dictGetSafeIterator(curr_functions_lib_ctx->libraries); + dictEntry *entry = NULL; + while ((entry = dictNext(iter))) { + functionLibInfo *li = dictGetVal(entry); + if (li->engine == engine) { + libraryUnlink(curr_functions_lib_ctx, li); + engineLibraryFree(li); + } + } + dictReleaseIterator(iter); } /* @@ -463,20 +464,8 @@ void functionStatsCommand(client *c) { } addReplyBulkCString(c, "engines"); - addReplyMapLen(c, dictSize(engines)); - dictIterator *iter = dictGetIterator(engines); - dictEntry *entry = NULL; - while ((entry = dictNext(iter))) { - engineInfo *ei = dictGetVal(entry); - addReplyBulkCString(c, ei->name); - addReplyMapLen(c, 2); - functionsLibEngineStats *e_stats = dictFetchValue(curr_functions_lib_ctx->engines_stats, ei->name); - addReplyBulkCString(c, "libraries_count"); - addReplyLongLong(c, e_stats->n_lib); - addReplyBulkCString(c, "functions_count"); - addReplyLongLong(c, e_stats->n_functions); - } - dictReleaseIterator(iter); + addReplyMapLen(c, scriptingEngineManagerGetNumEngines()); + scriptingEngineManagerForEachEngine(replyEngineStats, c); } static void functionListReplyFlags(client *c, functionInfo *fi) { @@ -552,7 +541,8 @@ void functionListCommand(client *c) { addReplyBulkCString(c, "library_name"); addReplyBulkCBuffer(c, li->name, sdslen(li->name)); addReplyBulkCString(c, "engine"); - addReplyBulkCBuffer(c, li->ei->name, sdslen(li->ei->name)); + sds engine_name = scriptingEngineGetName(li->engine); + addReplyBulkCBuffer(c, engine_name, sdslen(engine_name)); addReplyBulkCString(c, "functions"); addReplyArrayLen(c, dictSize(li->functions)); @@ -632,7 +622,7 @@ static void fcallCommandGeneric(client *c, int ro) { return; } functionInfo *fi = dictGetVal(de); - engine *engine = fi->li->ei->engine; + scriptingEngine *engine = fi->li->engine; long long numkeys; /* Get the number of arguments that are keys */ @@ 
-649,11 +639,16 @@ static void fcallCommandGeneric(client *c, int ro) { } scriptRunCtx run_ctx; - - if (scriptPrepareForRun(&run_ctx, fi->li->ei->c, c, fi->name, fi->f_flags, ro) != C_OK) return; - - engine->call(&run_ctx, engine->engine_ctx, fi->function, c->argv + 3, numkeys, c->argv + 3 + numkeys, - c->argc - 3 - numkeys); + if (scriptPrepareForRun(&run_ctx, scriptingEngineGetClient(engine), c, fi->name, fi->f_flags, ro) != C_OK) return; + + scriptingEngineCallFunction(engine, + &run_ctx, + run_ctx.original_client, + fi->function, + c->argv + 3, + numkeys, + c->argv + 3 + numkeys, + c->argc - 3 - numkeys); scriptResetRun(&run_ctx); } @@ -953,14 +948,34 @@ void functionFreeLibMetaData(functionsLibMetaData *md) { if (md->engine) sdsfree(md->engine); } +static void freeCompiledFunctions(scriptingEngine *engine, + compiledFunction **compiled_functions, + size_t num_compiled_functions, + size_t free_function_from_idx) { + for (size_t i = 0; i < num_compiled_functions; i++) { + compiledFunction *func = compiled_functions[i]; + decrRefCount(func->name); + if (func->desc) { + decrRefCount(func->desc); + } + if (i >= free_function_from_idx) { + scriptingEngineCallFreeFunction(engine, func->function); + } + zfree(func); + } + + zfree(compiled_functions); +} + /* Compile and save the given library, return the loaded library name on success * and NULL on failure. 
In case on failure the err out param is set with relevant error message */ sds functionsCreateWithLibraryCtx(sds code, int replace, sds *err, functionsLibCtx *lib_ctx, size_t timeout) { dictIterator *iter = NULL; dictEntry *entry = NULL; - functionLibInfo *new_li = NULL; functionLibInfo *old_li = NULL; functionsLibMetaData md = {0}; + functionLibInfo *new_li = NULL; + if (functionExtractLibMetaData(code, &md, err) != C_OK) { return NULL; } @@ -971,12 +986,13 @@ sds functionsCreateWithLibraryCtx(sds code, int replace, sds *err, functionsLibC goto error; } - engineInfo *ei = dictFetchValue(engines, md.engine); - if (!ei) { + scriptingEngine *engine = scriptingEngineManagerFind(md.engine); + if (!engine) { *err = sdscatfmt(sdsempty(), "Engine '%S' not found", md.engine); goto error; } - engine *engine = ei->engine; + + functionsAddEngineStats(md.engine); old_li = dictFetchValue(lib_ctx->libraries, md.name); if (old_li && !replace) { @@ -989,11 +1005,47 @@ sds functionsCreateWithLibraryCtx(sds code, int replace, sds *err, functionsLibC libraryUnlink(lib_ctx, old_li); } - new_li = engineLibraryCreate(md.name, ei, code); - if (engine->create(engine->engine_ctx, new_li, md.code, timeout, err) != C_OK) { + new_li = engineLibraryCreate(md.name, engine, code); + size_t num_compiled_functions = 0; + robj *compile_error = NULL; + compiledFunction **compiled_functions = + scriptingEngineCallCreateFunctionsLibrary(engine, + md.code, + timeout, + &num_compiled_functions, + &compile_error); + if (compiled_functions == NULL) { + serverAssert(num_compiled_functions == 0); + serverAssert(compile_error != NULL); + *err = sdsdup(compile_error->ptr); + decrRefCount(compile_error); goto error; } + serverAssert(compile_error == NULL); + + for (size_t i = 0; i < num_compiled_functions; i++) { + compiledFunction *func = compiled_functions[i]; + int ret = functionLibCreateFunction(func->name, + func->function, + new_li, + func->desc, + func->f_flags, + err); + if (ret == C_ERR) { + 
freeCompiledFunctions(engine, + compiled_functions, + num_compiled_functions, + i); + goto error; + } + } + + freeCompiledFunctions(engine, + compiled_functions, + num_compiled_functions, + num_compiled_functions); + if (dictSize(new_li->functions) == 0) { *err = sdsnew("No functions registered"); goto error; @@ -1063,6 +1115,7 @@ void functionLoadCommand(client *c) { timeout = 0; } if (!(library_name = functionsCreateWithLibraryCtx(code->ptr, replace, &err, curr_functions_lib_ctx, timeout))) { + serverAssert(err != NULL); addReplyErrorSds(c, err); return; } @@ -1072,28 +1125,26 @@ void functionLoadCommand(client *c) { addReplyBulkSds(c, library_name); } +static void getEngineUsedMemory(scriptingEngine *engine, void *context) { + size_t *engines_memory = (size_t *)context; + engineMemoryInfo mem_info = scriptingEngineCallGetMemoryInfo(engine); + *engines_memory += mem_info.used_memory; +} + /* Return memory usage of all the engines combine */ unsigned long functionsMemory(void) { - dictIterator *iter = dictGetIterator(engines); - dictEntry *entry = NULL; size_t engines_memory = 0; - while ((entry = dictNext(iter))) { - engineInfo *ei = dictGetVal(entry); - engine *engine = ei->engine; - engines_memory += engine->get_used_memory(engine->engine_ctx); - } - dictReleaseIterator(iter); - + scriptingEngineManagerForEachEngine(getEngineUsedMemory, &engines_memory); return engines_memory; } /* Return memory overhead of all the engines combine */ unsigned long functionsMemoryOverhead(void) { - size_t memory_overhead = dictMemUsage(engines); + size_t memory_overhead = scriptingEngineManagerGetMemoryUsage(); memory_overhead += dictMemUsage(curr_functions_lib_ctx->functions); memory_overhead += sizeof(functionsLibCtx); memory_overhead += curr_functions_lib_ctx->cache_memory; - memory_overhead += engine_cache_memory; + memory_overhead += scriptingEngineManagerGetTotalMemoryOverhead(); return memory_overhead; } @@ -1118,14 +1169,11 @@ size_t 
functionsLibCtxFunctionsLen(functionsLibCtx *functions_ctx) { /* Initialize engine data structures. * Should be called once on server initialization */ int functionsInit(void) { - engines = dictCreate(&engineDictType); + curr_functions_lib_ctx = functionsLibCtxCreate(); if (luaEngineInitEngine() != C_OK) { return C_ERR; } - /* Must be initialized after engines initialization */ - curr_functions_lib_ctx = functionsLibCtxCreate(); - return C_OK; } diff --git a/src/functions.h b/src/functions.h index b199fbd06e..7f6d144365 100644 --- a/src/functions.h +++ b/src/functions.h @@ -49,58 +49,20 @@ */ #include "server.h" +#include "scripting_engine.h" #include "script.h" #include "valkeymodule.h" typedef struct functionLibInfo functionLibInfo; -typedef struct engine { - /* engine specific context */ - void *engine_ctx; - - /* Create function callback, get the engine_ctx, and function code - * engine_ctx - opaque struct that was created on engine initialization - * li - library information that need to be provided and when add functions - * code - the library code - * timeout - timeout for the library creation (0 for no timeout) - * err - description of error (if occurred) - * returns C_ERR on error and set err to be the error message */ - int (*create)(void *engine_ctx, functionLibInfo *li, sds code, size_t timeout, sds *err); - - /* Invoking a function, r_ctx is an opaque object (from engine POV). 
- * The r_ctx should be used by the engine to interaction with the server, - * such interaction could be running commands, set resp, or set - * replication mode - */ - void (*call)(scriptRunCtx *r_ctx, - void *engine_ctx, - void *compiled_function, - robj **keys, - size_t nkeys, - robj **args, - size_t nargs); - - /* get current used memory by the engine */ - size_t (*get_used_memory)(void *engine_ctx); - - /* Return memory overhead for a given function, - * such memory is not counted as engine memory but as general - * structs memory that hold different information */ - size_t (*get_function_memory_overhead)(void *compiled_function); - - /* Return memory overhead for engine (struct size holding the engine)*/ - size_t (*get_engine_memory_overhead)(void *engine_ctx); - - /* free the given function */ - void (*free_function)(void *engine_ctx, void *compiled_function); -} engine; - /* Hold information about an engine. * Used on rdb.c so it must be declared here. */ typedef struct engineInfo { - sds name; /* Name of the engine */ - engine *engine; /* engine callbacks that allows to interact with the engine */ - client *c; /* Client that is used to run commands */ + sds name; /* Name of the engine */ + ValkeyModule *engineModule; /* the module that implements the scripting engine */ + ValkeyModuleCtx *module_ctx; /* Scripting engine module context */ + scriptingEngine *engine; /* engine callbacks that allows to interact with the engine */ + client *c; /* Client that is used to run commands */ } engineInfo; /* Hold information about the specific function. @@ -117,13 +79,12 @@ typedef struct functionInfo { /* Hold information about the specific library. * Used on rdb.c so it must be declared here. 
*/ struct functionLibInfo { - sds name; /* Library name */ - dict *functions; /* Functions dictionary */ - engineInfo *ei; /* Pointer to the function engine */ - sds code; /* Library code */ + sds name; /* Library name */ + dict *functions; /* Functions dictionary */ + scriptingEngine *engine; /* Pointer to the scripting engine */ + sds code; /* Library code */ }; -int functionsRegisterEngine(const char *engine_name, engine *engine_ctx); sds functionsCreateWithLibraryCtx(sds code, int replace, sds *err, functionsLibCtx *lib_ctx, size_t timeout); unsigned long functionsMemory(void); unsigned long functionsMemoryOverhead(void); @@ -138,7 +99,7 @@ void functionsLibCtxFree(functionsLibCtx *functions_lib_ctx); void functionsLibCtxClear(functionsLibCtx *lib_ctx, void(callback)(dict *)); void functionsLibCtxSwapWithCurrent(functionsLibCtx *new_lib_ctx, int async); -int functionLibCreateFunction(sds name, void *function, functionLibInfo *li, sds desc, uint64_t f_flags, sds *err); +void functionsRemoveLibFromEngine(scriptingEngine *engine); int luaEngineInitEngine(void); int functionsInit(void); diff --git a/src/geo.c b/src/geo.c index 75654f85a5..65f17c81db 100644 --- a/src/geo.c +++ b/src/geo.c @@ -774,7 +774,7 @@ void georadiusGeneric(client *c, int srcKeyIndex, int flags) { if (maxelelen < elelen) maxelelen = elelen; totelelen += elelen; znode = zslInsert(zs->zsl, score, gp->member); - serverAssert(dictAdd(zs->dict, gp->member, &znode->score) == DICT_OK); + serverAssert(hashtableAdd(zs->ht, znode)); gp->member = NULL; } diff --git a/src/geohash_helper.c b/src/geohash_helper.c index aa4b4743a6..c05c2f2634 100644 --- a/src/geohash_helper.c +++ b/src/geohash_helper.c @@ -48,7 +48,7 @@ /// @brief The usual PI/180 constant const double DEG_TO_RAD = 0.017453292519943295769236907684886; -/// @brief Earth's quatratic mean radius for WGS-84 +/// @brief Earth's quadratic mean radius for WGS-84 const double EARTH_RADIUS_IN_METERS = 6372797.560856; const double MERCATOR_MAX = 
20037726.37; diff --git a/src/hashtable.c b/src/hashtable.c index 9d963b9ddc..23097eb246 100644 --- a/src/hashtable.c +++ b/src/hashtable.c @@ -300,7 +300,7 @@ typedef struct { long index; uint16_t pos_in_bucket; uint8_t table; - uint8_t safe; + uint8_t flags; union { /* Unsafe iterator fingerprint for misuse detection. */ uint64_t fingerprint; @@ -498,7 +498,7 @@ size_t nextCursor(size_t v, size_t mask) { } /* Returns the next bucket in a bucket chain, or NULL if there's no next. */ -static bucket *bucketNext(bucket *b) { +static bucket *getChildBucket(bucket *b) { return b->chained ? b->entries[ENTRIES_PER_BUCKET - 1] : NULL; } @@ -548,12 +548,12 @@ static void rehashStep(hashtable *ht) { rehashBucket(ht, b); if (b->chained) { /* Rehash and free child buckets. */ - bucket *next = bucketNext(b); + bucket *next = getChildBucket(b); b->chained = 0; b = next; while (b != NULL) { rehashBucket(ht, b); - next = bucketNext(b); + next = getChildBucket(b); zfree(b); if (ht->type->trackMemUsage) ht->type->trackMemUsage(ht, -sizeof(bucket)); ht->child_buckets[0]--; @@ -708,7 +708,7 @@ static bucket *findBucket(hashtable *ht, uint64_t hash, const void *key, int *po } } } - b = bucketNext(b); + b = getChildBucket(b); } while (b != NULL); } return NULL; @@ -753,7 +753,7 @@ static void bucketConvertToUnchained(bucket *b) { * This function needs the penultimate 'before_last' bucket in the chain, to be * able to update it when the last bucket is freed. 
*/ static void pruneLastBucket(hashtable *ht, bucket *before_last, bucket *last, int table_index) { - assert(before_last->chained && bucketNext(before_last) == last); + assert(before_last->chained && getChildBucket(before_last) == last); assert(!last->chained); assert(last->presence == 0 || __builtin_popcount(last->presence) == 1); bucketConvertToUnchained(before_last); @@ -775,10 +775,10 @@ static void fillBucketHole(hashtable *ht, bucket *b, int pos_in_bucket, int tabl assert(b->chained && !isPositionFilled(b, pos_in_bucket)); /* Find the last bucket */ bucket *before_last = b; - bucket *last = bucketNext(b); + bucket *last = getChildBucket(b); while (last->chained) { before_last = last; - last = bucketNext(last); + last = getChildBucket(last); } /* Unless the last bucket is empty, find an entry in the last bucket and * move it to the hole in b. */ @@ -800,10 +800,10 @@ static void fillBucketHole(hashtable *ht, bucket *b, int pos_in_bucket, int tabl static void compactBucketChain(hashtable *ht, size_t bucket_index, int table_index) { bucket *b = &ht->tables[table_index][bucket_index]; while (b->chained) { - bucket *next = bucketNext(b); + bucket *next = getChildBucket(b); if (next->chained && next->presence == 0) { /* Empty bucket in the middle of the chain. Remove it from the chain. */ - bucket *next_next = bucketNext(next); + bucket *next_next = getChildBucket(next); b->entries[ENTRIES_PER_BUCKET - 1] = next_next; zfree(next); if (ht->type->trackMemUsage) ht->type->trackMemUsage(ht, -sizeof(bucket)); @@ -846,7 +846,7 @@ static bucket *findBucketForInsert(hashtable *ht, uint64_t hash, int *pos_in_buc bucketConvertToChained(ht, b); ht->child_buckets[table]++; } - b = bucketNext(b); + b = getChildBucket(b); } /* Find a free slot in the bucket. There must be at least one. 
*/ int pos; @@ -934,6 +934,72 @@ static inline incrementalFind *incrementalFindFromOpaque(hashtableIncrementalFin return (incrementalFind *)(void *)state; } +/* Prefetches all filled entries in the given bucket to optimize future memory access. */ +static void prefetchBucketEntries(bucket *b) { + if (!b->presence) return; + for (int pos = 0; pos < numBucketPositions(b); pos++) { + if (isPositionFilled(b, pos)) { + valkey_prefetch(b->entries[pos]); + } + } +} + +/* Returns the child bucket if chained, otherwise the next bucket in the table. returns NULL if neither exists. */ +static bucket *getNextBucket(bucket *current_bucket, size_t bucket_index, hashtable *ht, int table_index) { + bucket *next_bucket = NULL; + if (current_bucket->chained) { + next_bucket = getChildBucket(current_bucket); + } else { + size_t table_size = numBuckets(ht->bucket_exp[table_index]); + size_t next_index = bucket_index + 1; + if (next_index < table_size) { + next_bucket = &ht->tables[table_index][next_index]; + } + } + return next_bucket; +} + +/* This function prefetches data that will be needed in subsequent iterations: + * - The entries of the next bucket + * - The next of the next bucket + * It attempts to bring this data closer to the L1 cache to reduce future memory access latency. + * + * Cache state before this function is called(due to last call for this function): + * 1. The current bucket and its entries are likely already in cache. + * 2. The next bucket is in cache. 
+ */ +static void prefetchNextBucketEntries(iter *iter, bucket *current_bucket) { + size_t next_index = iter->index + 1; + bucket *next_bucket = getNextBucket(current_bucket, next_index, iter->hashtable, iter->table); + if (next_bucket) { + prefetchBucketEntries(next_bucket); + bucket *next_next_bucket = getNextBucket(next_bucket, next_index + 1, iter->hashtable, iter->table); + if (next_next_bucket) { + valkey_prefetch(next_next_bucket); + } + } +} + +/* Prefetches the values associated with the entries in the given bucket by + * calling the entryPrefetchValue callback in the hashtableType */ +static void prefetchBucketValues(bucket *b, hashtable *ht) { + if (!b->presence) return; + assert(ht->type->entryPrefetchValue != NULL); + for (int pos = 0; pos < numBucketPositions(b); pos++) { + if (isPositionFilled(b, pos)) { + ht->type->entryPrefetchValue(b->entries[pos]); + } + } +} + +static inline int isSafe(iter *iter) { + return (iter->flags & HASHTABLE_ITER_SAFE); +} + +static inline int shouldPrefetchValues(iter *iter) { + return (iter->flags & HASHTABLE_ITER_PREFETCH_VALUES); +} + /* --- API functions --- */ /* Allocates and initializes a new hashtable specified by the given type. */ @@ -979,7 +1045,7 @@ void hashtableEmpty(hashtable *ht, void(callback)(hashtable *)) { } } } - bucket *next = bucketNext(b); + bucket *next = getChildBucket(b); /* Free allocated bucket. */ if (b != &ht->tables[table_index][idx]) { @@ -1023,7 +1089,7 @@ void *hashtableMetadata(hashtable *ht) { } /* Returns the number of entries stored. */ -size_t hashtableSize(hashtable *ht) { +size_t hashtableSize(const hashtable *ht) { return ht->used[0] + ht->used[1]; } @@ -1180,6 +1246,14 @@ hashtable *hashtableDefragTables(hashtable *ht, void *(*defragfn)(void *)) { return ht1; } +/* Used for releasing memory to OS to avoid unnecessary CoW. Called when we've + * forked and memory won't be used again. 
See zmadvise_dontneed() */ +void dismissHashtable(hashtable *ht) { + for (int i = 0; i < 2; i++) { + zmadvise_dontneed(ht->tables[i], numBuckets(ht->bucket_exp[i]) * sizeof(bucket *)); + } +} + /* Returns 1 if an entry was found matching the key. Also points *found to it, * if found is provided. Returns 0 if no matching entry was found. */ int hashtableFind(hashtable *ht, const void *key, void **found) { @@ -1367,7 +1441,7 @@ int hashtableReplaceReallocatedEntry(hashtable *ht, const void *old_entry, void return 1; } } - b = bucketNext(b); + b = getChildBucket(b); } while (b != NULL); } return 0; @@ -1530,8 +1604,8 @@ int hashtableIncrementalFindStep(hashtableIncrementalFindState *state) { bucket_idx = data->hash & mask; } data->bucket = &ht->tables[data->table][bucket_idx]; - } else if (bucketNext(data->bucket) != NULL) { - data->bucket = bucketNext(data->bucket); + } else if (getChildBucket(data->bucket) != NULL) { + data->bucket = getChildBucket(data->bucket); } else if (data->table == 0 && ht->rehash_idx >= 0) { data->table = 1; size_t mask = expToMask(ht->bucket_exp[1]); @@ -1648,7 +1722,7 @@ size_t hashtableScanDefrag(hashtable *ht, size_t cursor, hashtableScanFunction f } } } - bucket *next = bucketNext(b); + bucket *next = getChildBucket(b); if (next != NULL && defragfn != NULL) { next = bucketDefrag(b, next, defragfn); } @@ -1685,7 +1759,7 @@ size_t hashtableScanDefrag(hashtable *ht, size_t cursor, hashtableScanFunction f } } } - bucket *next = bucketNext(b); + bucket *next = getChildBucket(b); if (next != NULL && defragfn != NULL) { next = bucketDefrag(b, next, defragfn); } @@ -1715,7 +1789,7 @@ size_t hashtableScanDefrag(hashtable *ht, size_t cursor, hashtableScanFunction f } } } - bucket *next = bucketNext(b); + bucket *next = getChildBucket(b); if (next != NULL && defragfn != NULL) { next = bucketDefrag(b, next, defragfn); } @@ -1739,31 +1813,32 @@ size_t hashtableScanDefrag(hashtable *ht, size_t cursor, hashtableScanFunction f /* --- Iterator --- */ 
-/* Initialize a iterator, that is not allowed to insert, delete or even lookup - * entries in the hashtable, because such operations can trigger incremental - * rehashing which moves entries around and confuses the iterator. Only - * hashtableNext is allowed. Each entry is returned exactly once. Call - * hashtableResetIterator when you are done. See also - * hashtableInitSafeIterator. */ -void hashtableInitIterator(hashtableIterator *iterator, hashtable *ht) { - iter *iter; - iter = iteratorFromOpaque(iterator); - iter->hashtable = ht; - iter->table = 0; - iter->index = -1; - iter->safe = 0; -} - -/* Initialize a safe iterator, which is allowed to modify the hash table while - * iterating. It pauses incremental rehashing to prevent entries from moving - * around. Call hashtableNext to fetch each entry. You must call - * hashtableResetIterator when you are done with a safe iterator. +/* Initialize an iterator for a hashtable. * - * It's allowed to insert and replace entries. Deleting entries is only allowed - * for the entry that was just returned by hashtableNext. Deleting other entries - * is possible, but doing so can cause internal fragmentation, so don't. + * The 'flags' argument can be used to tweak the behaviour. It's a bitwise-or + * (zero means no flags) of the following: + * + * - HASHTABLE_ITER_SAFE: Use a safe iterator that can handle + * modifications to the hash table while iterating. + * - HASHTABLE_ITER_PREFETCH_VALUES: Enables prefetching of entries values, + * which can improve performance in some scenarios. Because the hashtable is generic and + * doesn't care which object we store, the callback entryPrefetchValue must be set to help + * us prefetch necessary fields of specific object types stored in the hashtable. 
* - * Guarantees: + * For a non-safe iterator (default, when HASHTABLE_ITER_SAFE is not set): + * It is not allowed to insert, delete or even lookup entries in the hashtable, + * because such operations can trigger incremental rehashing which moves entries + * around and confuses the iterator. Only hashtableNext is allowed. Each entry + * is returned exactly once. + * + * For a safe iterator (when HASHTABLE_ITER_SAFE is set): + * It is allowed to modify the hash table while iterating. It pauses incremental + * rehashing to prevent entries from moving around. It's allowed to insert and + * replace entries. Deleting entries is only allowed for the entry that was just + * returned by hashtableNext. Deleting other entries is possible, but doing so + * can cause internal fragmentation, so don't. + * + * Guarantees for safe iterators: * * - Entries that are in the hash table for the entire iteration are returned * exactly once. @@ -1776,18 +1851,31 @@ void hashtableInitIterator(hashtableIterator *iterator, hashtable *ht) { * * - Entries that are inserted during the iteration may or may not be returned * by the iterator. + * + * Call hashtableNext to fetch each entry. You must call hashtableResetIterator + * when you are done with the iterator. */ -void hashtableInitSafeIterator(hashtableIterator *iterator, hashtable *ht) { - hashtableInitIterator(iterator, ht); +void hashtableInitIterator(hashtableIterator *iterator, hashtable *ht, uint8_t flags) { + iter *iter; + iter = iteratorFromOpaque(iterator); + iter->hashtable = ht; + iter->table = 0; + iter->index = -1; + iter->flags = flags; +} + +/* Reinitializes the iterator for the provided hashtable while + * preserving the flags from its previous initialization. */ +void hashtableReinitIterator(hashtableIterator *iterator, hashtable *ht) { iter *iter = iteratorFromOpaque(iterator); - iter->safe = 1; + hashtableInitIterator(iterator, ht, iter->flags); } /* Resets a stack-allocated iterator. 
*/ void hashtableResetIterator(hashtableIterator *iterator) { iter *iter = iteratorFromOpaque(iterator); if (!(iter->index == -1 && iter->table == 0)) { - if (iter->safe) { + if (isSafe(iter)) { hashtableResumeRehashing(iter->hashtable); assert(iter->hashtable->pause_rehash >= 0); } else { @@ -1797,21 +1885,13 @@ void hashtableResetIterator(hashtableIterator *iterator) { } /* Allocates and initializes an iterator. */ -hashtableIterator *hashtableCreateIterator(hashtable *ht) { +hashtableIterator *hashtableCreateIterator(hashtable *ht, uint8_t flags) { iter *iter = zmalloc(sizeof(*iter)); hashtableIterator *opaque = iteratorToOpaque(iter); - hashtableInitIterator(opaque, ht); + hashtableInitIterator(opaque, ht, flags); return opaque; } -/* Allocates and initializes a safe iterator. */ -hashtableIterator *hashtableCreateSafeIterator(hashtable *ht) { - hashtableIterator *iterator = hashtableCreateIterator(ht); - iter *iter = iteratorFromOpaque(iterator); - iter->safe = 1; - return iterator; -} - /* Resets and frees the memory of an allocated iterator, i.e. one created using * hashtableCreate(Safe)Iterator. */ void hashtableReleaseIterator(hashtableIterator *iterator) { @@ -1827,7 +1907,7 @@ int hashtableNext(hashtableIterator *iterator, void **elemptr) { while (1) { if (iter->index == -1 && iter->table == 0) { /* It's the first call to next. */ - if (iter->safe) { + if (isSafe(iter)) { hashtablePauseRehashing(iter->hashtable); iter->last_seen_size = iter->hashtable->used[iter->table]; } else { @@ -1851,10 +1931,10 @@ int hashtableNext(hashtableIterator *iterator, void **elemptr) { iter->pos_in_bucket++; if (iter->bucket->chained && iter->pos_in_bucket >= ENTRIES_PER_BUCKET - 1) { iter->pos_in_bucket = 0; - iter->bucket = bucketNext(iter->bucket); + iter->bucket = getChildBucket(iter->bucket); } else if (iter->pos_in_bucket >= ENTRIES_PER_BUCKET) { /* Bucket index done. 
*/ - if (iter->safe) { + if (isSafe(iter)) { /* If entries in this bucket chain have been deleted, * they've left empty spaces in the buckets. The chain is * not automatically compacted when rehashing is paused. If @@ -1882,6 +1962,12 @@ int hashtableNext(hashtableIterator *iterator, void **elemptr) { } } bucket *b = iter->bucket; + if (iter->pos_in_bucket == 0) { + if (shouldPrefetchValues(iter)) { + prefetchBucketValues(b, iter->hashtable); + } + prefetchNextBucketEntries(iter, b); + } if (!isPositionFilled(b, iter->pos_in_bucket)) { /* No entry here. */ continue; @@ -1980,7 +2066,7 @@ hashtableStats *hashtableGetStatsHt(hashtable *ht, int table_index, int full) { unsigned long chainlen = 0; while (b->chained) { chainlen++; - b = bucketNext(b); + b = getChildBucket(b); } if (chainlen > stats->max_chain_len) { stats->max_chain_len = chainlen; @@ -2075,7 +2161,7 @@ void hashtableDump(hashtable *ht) { printf("(empty)\n"); } } - b = bucketNext(b); + b = getChildBucket(b); level++; } while (b != NULL); } @@ -2109,7 +2195,7 @@ void hashtableHistogram(hashtable *ht) { continue; } printf("%X", __builtin_popcount(b->presence)); - buckets[idx] = bucketNext(b); + buckets[idx] = getChildBucket(b); if (buckets[idx] == NULL) chains_left--; } printf("\n"); @@ -2130,7 +2216,7 @@ int hashtableLongestBucketChain(hashtable *ht) { if (++chainlen > maxlen) { maxlen = chainlen; } - b = bucketNext(b); + b = getChildBucket(b); } } } diff --git a/src/hashtable.h b/src/hashtable.h index 242531df8f..67e8a139f8 100644 --- a/src/hashtable.h +++ b/src/hashtable.h @@ -60,6 +60,8 @@ typedef struct { /* Callback to free an entry when it's overwritten or deleted. * Optional. */ void (*entryDestructor)(void *entry); + /* Callback to prefetch the value associated with a hashtable entry. */ + void (*entryPrefetchValue)(const void *entry); /* Callback to control when resizing should be allowed. */ int (*resizeAllowed)(size_t moreMem, double usedRatio); /* Invoked at the start of rehashing. 
*/ @@ -91,6 +93,10 @@ typedef void (*hashtableScanFunction)(void *privdata, void *entry); /* Scan flags */ #define HASHTABLE_SCAN_EMIT_REF (1 << 0) +/* Iterator flags */ +#define HASHTABLE_ITER_SAFE (1 << 0) +#define HASHTABLE_ITER_PREFETCH_VALUES (1 << 1) + /* --- Prototypes --- */ /* Hash function (global seed) */ @@ -108,7 +114,7 @@ void hashtableRelease(hashtable *ht); void hashtableEmpty(hashtable *ht, void(callback)(hashtable *)); hashtableType *hashtableGetType(hashtable *ht); void *hashtableMetadata(hashtable *ht); -size_t hashtableSize(hashtable *ht); +size_t hashtableSize(const hashtable *ht); size_t hashtableBuckets(hashtable *ht); size_t hashtableChainedBuckets(hashtable *ht, int table); size_t hashtableMemUsage(hashtable *ht); @@ -123,6 +129,7 @@ int hashtableTryExpand(hashtable *ht, size_t size); int hashtableExpandIfNeeded(hashtable *ht); int hashtableShrinkIfNeeded(hashtable *ht); hashtable *hashtableDefragTables(hashtable *ht, void *(*defragfn)(void *)); +void dismissHashtable(hashtable *ht); /* Entries */ int hashtableFind(hashtable *ht, const void *key, void **found); @@ -143,11 +150,10 @@ int hashtableIncrementalFindGetResult(hashtableIncrementalFindState *state, void /* Iteration & scan */ size_t hashtableScan(hashtable *ht, size_t cursor, hashtableScanFunction fn, void *privdata); size_t hashtableScanDefrag(hashtable *ht, size_t cursor, hashtableScanFunction fn, void *privdata, void *(*defragfn)(void *), int flags); -void hashtableInitIterator(hashtableIterator *iter, hashtable *ht); -void hashtableInitSafeIterator(hashtableIterator *iter, hashtable *ht); +void hashtableInitIterator(hashtableIterator *iter, hashtable *ht, uint8_t flags); +void hashtableReinitIterator(hashtableIterator *iterator, hashtable *ht); void hashtableResetIterator(hashtableIterator *iter); -hashtableIterator *hashtableCreateIterator(hashtable *ht); -hashtableIterator *hashtableCreateSafeIterator(hashtable *ht); +hashtableIterator *hashtableCreateIterator(hashtable *ht, 
uint8_t flags); void hashtableReleaseIterator(hashtableIterator *iter); int hashtableNext(hashtableIterator *iter, void **elemptr); diff --git a/src/hyperloglog.c b/src/hyperloglog.c index f0390b3e1e..6056bc0098 100644 --- a/src/hyperloglog.c +++ b/src/hyperloglog.c @@ -36,6 +36,9 @@ #include #ifdef HAVE_AVX2 +/* Define __MM_MALLOC_H to prevent importing the memory aligned + * allocation functions, which we don't use. */ +#define __MM_MALLOC_H #include #endif diff --git a/src/io_threads.c b/src/io_threads.c index 3865eb77c3..66ef4948b6 100644 --- a/src/io_threads.c +++ b/src/io_threads.c @@ -321,8 +321,8 @@ int trySendReadToIOThreads(client *c) { if (server.active_io_threads_num <= 1) return C_ERR; /* If IO thread is already reading, return C_OK to make sure the main thread will not handle it. */ if (c->io_read_state != CLIENT_IDLE) return C_OK; - /* Currently, replica/master writes are not offloaded and are processed synchronously. */ - if (c->flag.primary || getClientType(c) == CLIENT_TYPE_REPLICA) return C_ERR; + /* Currently, replica reads are not offloaded to IO threads. */ + if (getClientType(c) == CLIENT_TYPE_REPLICA) return C_ERR; /* With Lua debug client we may call connWrite directly in the main thread */ if (c->flag.lua_debug) return C_ERR; /* For simplicity let the main-thread handle the blocked clients */ @@ -345,6 +345,7 @@ int trySendReadToIOThreads(client *c) { c->cur_tid = tid; c->read_flags = canParseCommand(c) ? 0 : READ_FLAGS_DONT_PARSE; c->read_flags |= authRequired(c) ? READ_FLAGS_AUTH_REQUIRED : 0; + c->read_flags |= c->flag.primary ? READ_FLAGS_PRIMARY : 0; c->io_read_state = CLIENT_PENDING_IO; connSetPostponeUpdateState(c->conn, 1); @@ -363,8 +364,8 @@ int trySendWriteToIOThreads(client *c) { if (c->io_write_state != CLIENT_IDLE) return C_OK; /* Nothing to write */ if (!clientHasPendingReplies(c)) return C_ERR; - /* Currently, replica/master writes are not offloaded and are processed synchronously. 
*/ - if (c->flag.primary || getClientType(c) == CLIENT_TYPE_REPLICA) return C_ERR; + /* Currently, replica writes are not offloaded to IO threads. */ + if (getClientType(c) == CLIENT_TYPE_REPLICA) return C_ERR; /* We can't offload debugged clients as the main-thread may read at the same time */ if (c->flag.lua_debug) return C_ERR; @@ -561,3 +562,55 @@ void trySendPollJobToIOThreads(void) { aeSetPollProtect(server.el, 1); IOJobQueue_push(jq, IOThreadPoll, server.el); } + +static void ioThreadAccept(void *data) { + client *c = (client *)data; + connAccept(c->conn, NULL); + c->io_read_state = CLIENT_COMPLETED_IO; +} + +/* + * Attempts to offload an Accept operation (currently used for TLS accept) for a client + * connection to I/O threads. + * + * Returns: + * C_OK - If the accept operation was successfully queued for processing + * C_ERR - If the connection is not eligible for offloading + * + * Parameters: + * conn - The connection object to perform the accept operation on + */ +int trySendAcceptToIOThreads(connection *conn) { + if (server.io_threads_num <= 1) { + return C_ERR; + } + + if (!(conn->flags & CONN_FLAG_ALLOW_ACCEPT_OFFLOAD)) { + return C_ERR; + } + + client *c = connGetPrivateData(conn); + if (c->io_read_state != CLIENT_IDLE) { + return C_OK; + } + + if (server.active_io_threads_num <= 1) { + return C_ERR; + } + + size_t thread_id = (c->id % (server.active_io_threads_num - 1)) + 1; + IOJobQueue *job_queue = &io_jobs[thread_id]; + + if (IOJobQueue_isFull(job_queue)) { + return C_ERR; + } + + c->io_read_state = CLIENT_PENDING_IO; + c->flag.pending_read = 1; + listLinkNodeTail(server.clients_pending_io_read, &c->pending_read_list_node); + connSetPostponeUpdateState(c->conn, 1); + server.stat_io_accept_offloaded++; + IOJobQueue_push(job_queue, ioThreadAccept, c); + + return C_OK; +} diff --git a/src/io_threads.h b/src/io_threads.h index 8818f08588..a3ff582a77 100644 --- a/src/io_threads.h +++ b/src/io_threads.h @@ -13,5 +13,6 @@ int 
tryOffloadFreeArgvToIOThreads(client *c, int argc, robj **argv); void adjustIOThreadsByEventLoad(int numevents, int increase_only); void drainIOThreadsQueue(void); void trySendPollJobToIOThreads(void); +int trySendAcceptToIOThreads(connection *conn); #endif /* IO_THREADS_H */ diff --git a/src/kvstore.c b/src/kvstore.c index d6db4d3fe1..76bfb35d98 100644 --- a/src/kvstore.c +++ b/src/kvstore.c @@ -467,7 +467,7 @@ void kvstoreGetStats(kvstore *kvs, char *buf, size_t bufsize, int full) { hashtableStats *mainHtStats = NULL; hashtableStats *rehashHtStats = NULL; hashtable *ht; - kvstoreIterator *kvs_it = kvstoreIteratorInit(kvs); + kvstoreIterator *kvs_it = kvstoreIteratorInit(kvs, HASHTABLE_ITER_SAFE); while ((ht = kvstoreIteratorNextHashtable(kvs_it))) { hashtableStats *stats = hashtableGetStatsHt(ht, 0, full); if (!mainHtStats) { @@ -576,12 +576,12 @@ int kvstoreNumHashtables(kvstore *kvs) { /* Returns kvstore iterator that can be used to iterate through sub-hash tables. * * The caller should free the resulting kvs_it with kvstoreIteratorRelease. */ -kvstoreIterator *kvstoreIteratorInit(kvstore *kvs) { +kvstoreIterator *kvstoreIteratorInit(kvstore *kvs, uint8_t flags) { kvstoreIterator *kvs_it = zmalloc(sizeof(*kvs_it)); kvs_it->kvs = kvs; kvs_it->didx = -1; kvs_it->next_didx = kvstoreGetFirstNonEmptyHashtableIndex(kvs_it->kvs); /* Finds first non-empty hashtable index. */ - hashtableInitSafeIterator(&kvs_it->di, NULL); + hashtableInitIterator(&kvs_it->di, NULL, flags); return kvs_it; } @@ -625,7 +625,7 @@ int kvstoreIteratorNext(kvstoreIterator *kvs_it, void **next) { /* No current hashtable or reached the end of the hash table. 
*/ hashtable *ht = kvstoreIteratorNextHashtable(kvs_it); if (!ht) return 0; - hashtableInitSafeIterator(&kvs_it->di, ht); + hashtableReinitIterator(&kvs_it->di, ht); return hashtableNext(&kvs_it->di, next); } } @@ -691,23 +691,15 @@ unsigned long kvstoreHashtableSize(kvstore *kvs, int didx) { return hashtableSize(ht); } -kvstoreHashtableIterator *kvstoreGetHashtableIterator(kvstore *kvs, int didx) { +kvstoreHashtableIterator *kvstoreGetHashtableIterator(kvstore *kvs, int didx, uint8_t flags) { kvstoreHashtableIterator *kvs_di = zmalloc(sizeof(*kvs_di)); kvs_di->kvs = kvs; kvs_di->didx = didx; - hashtableInitIterator(&kvs_di->di, kvstoreGetHashtable(kvs, didx)); + hashtableInitIterator(&kvs_di->di, kvstoreGetHashtable(kvs, didx), flags); return kvs_di; } -kvstoreHashtableIterator *kvstoreGetHashtableSafeIterator(kvstore *kvs, int didx) { - kvstoreHashtableIterator *kvs_di = zmalloc(sizeof(*kvs_di)); - kvs_di->kvs = kvs; - kvs_di->didx = didx; - hashtableInitSafeIterator(&kvs_di->di, kvstoreGetHashtable(kvs, didx)); - return kvs_di; -} - -/* Free the kvs_di returned by kvstoreGetHashtableIterator and kvstoreGetHashtableSafeIterator. */ +/* Free the kvs_di returned by kvstoreGetHashtableIterator. */ void kvstoreReleaseHashtableIterator(kvstoreHashtableIterator *kvs_di) { /* The hashtable may be deleted during the iteration process, so here need to check for NULL. 
*/ if (kvstoreGetHashtable(kvs_di->kvs, kvs_di->didx)) { diff --git a/src/kvstore.h b/src/kvstore.h index 1a8c74a6b9..d5db1a89aa 100644 --- a/src/kvstore.h +++ b/src/kvstore.h @@ -43,7 +43,7 @@ void kvstoreHashtableTrackMemUsage(hashtable *s, ssize_t delta); size_t kvstoreHashtableMetadataSize(void); /* kvstore iterator specific functions */ -kvstoreIterator *kvstoreIteratorInit(kvstore *kvs); +kvstoreIterator *kvstoreIteratorInit(kvstore *kvs, uint8_t flags); void kvstoreIteratorRelease(kvstoreIterator *kvs_it); int kvstoreIteratorGetCurrentHashtableIndex(kvstoreIterator *kvs_it); int kvstoreIteratorNext(kvstoreIterator *kvs_it, void **next); @@ -57,8 +57,7 @@ unsigned long kvstoreHashtableRehashingCount(kvstore *kvs); /* Specific hashtable access by hashtable-index */ unsigned long kvstoreHashtableSize(kvstore *kvs, int didx); -kvstoreHashtableIterator *kvstoreGetHashtableIterator(kvstore *kvs, int didx); -kvstoreHashtableIterator *kvstoreGetHashtableSafeIterator(kvstore *kvs, int didx); +kvstoreHashtableIterator *kvstoreGetHashtableIterator(kvstore *kvs, int didx, uint8_t flags); void kvstoreReleaseHashtableIterator(kvstoreHashtableIterator *kvs_id); int kvstoreHashtableIteratorNext(kvstoreHashtableIterator *kvs_di, void **next); int kvstoreHashtableRandomEntry(kvstore *kvs, int didx, void **found); diff --git a/src/latency.c b/src/latency.c index 2beb4859d1..7dcdaea967 100644 --- a/src/latency.c +++ b/src/latency.c @@ -266,10 +266,10 @@ sds createLatencyReport(void) { /* Potentially commands. 
*/ if (!strcasecmp(event, "command")) { - if (server.slowlog_log_slower_than < 0 || server.slowlog_max_len == 0) { + if (server.commandlog[COMMANDLOG_TYPE_SLOW].threshold < 0 || server.commandlog[COMMANDLOG_TYPE_SLOW].max_len == 0) { advise_slowlog_enabled = 1; advices++; - } else if (server.slowlog_log_slower_than / 1000 > server.latency_monitor_threshold) { + } else if (server.commandlog[COMMANDLOG_TYPE_SLOW].threshold / 1000 > server.latency_monitor_threshold) { advise_slowlog_tuning = 1; advices++; } @@ -528,7 +528,7 @@ void fillCommandCDF(client *c, struct hdr_histogram *histogram) { * a per command cumulative distribution of latencies. */ void latencyAllCommandsFillCDF(client *c, hashtable *commands, int *command_with_data) { hashtableIterator iter; - hashtableInitSafeIterator(&iter, commands); + hashtableInitIterator(&iter, commands, HASHTABLE_ITER_SAFE); void *next; while (hashtableNext(&iter, &next)) { struct serverCommand *cmd = next; @@ -565,7 +565,7 @@ void latencySpecificCommandsFillCDF(client *c) { if (cmd->subcommands_ht) { hashtableIterator iter; - hashtableInitSafeIterator(&iter, cmd->subcommands_ht); + hashtableInitIterator(&iter, cmd->subcommands_ht, HASHTABLE_ITER_SAFE); void *next; while (hashtableNext(&iter, &next)) { struct serverCommand *sub = next; diff --git a/src/lazyfree.c b/src/lazyfree.c index 14a4454d7a..3b061ccd84 100644 --- a/src/lazyfree.c +++ b/src/lazyfree.c @@ -2,6 +2,7 @@ #include "bio.h" #include "functions.h" #include "cluster.h" +#include "module.h" #include @@ -116,15 +117,15 @@ size_t lazyfreeGetFreeEffort(robj *key, robj *obj, int dbid) { if (obj->type == OBJ_LIST && obj->encoding == OBJ_ENCODING_QUICKLIST) { quicklist *ql = obj->ptr; return ql->len; - } else if (obj->type == OBJ_SET && obj->encoding == OBJ_ENCODING_HT) { - dict *ht = obj->ptr; - return dictSize(ht); + } else if (obj->type == OBJ_SET && obj->encoding == OBJ_ENCODING_HASHTABLE) { + hashtable *ht = obj->ptr; + return hashtableSize(ht); } else if (obj->type 
== OBJ_ZSET && obj->encoding == OBJ_ENCODING_SKIPLIST) { zset *zs = obj->ptr; return zs->zsl->length; - } else if (obj->type == OBJ_HASH && obj->encoding == OBJ_ENCODING_HT) { - dict *ht = obj->ptr; - return dictSize(ht); + } else if (obj->type == OBJ_HASH && obj->encoding == OBJ_ENCODING_HASHTABLE) { + hashtable *ht = obj->ptr; + return hashtableSize(ht); } else if (obj->type == OBJ_STREAM) { size_t effort = 0; stream *s = obj->ptr; diff --git a/src/listpack.c b/src/listpack.c index 2dfb321f56..76c2f9ea38 100644 --- a/src/listpack.c +++ b/src/listpack.c @@ -250,6 +250,12 @@ void lpFree(unsigned char *lp) { lp_free(lp); } +/* Same as lpFree, but useful for when you are passing the listpack + * into a generic free function that expects (void *) */ +void lpFreeVoid(void *lp) { + lp_free((unsigned char *)lp); +} + /* Shrink the memory to fit. */ unsigned char *lpShrinkToFit(unsigned char *lp) { size_t size = lpGetTotalBytes(lp); diff --git a/src/listpack.h b/src/listpack.h index aa7636143f..b143797261 100644 --- a/src/listpack.h +++ b/src/listpack.h @@ -56,6 +56,7 @@ typedef struct { unsigned char *lpNew(size_t capacity); void lpFree(unsigned char *lp); +void lpFreeVoid(void *lp); unsigned char *lpShrinkToFit(unsigned char *lp); unsigned char * lpInsertString(unsigned char *lp, unsigned char *s, uint32_t slen, unsigned char *p, int where, unsigned char **newp); diff --git a/src/module.c b/src/module.c index 9bcf68646e..37bd310427 100644 --- a/src/module.c +++ b/src/module.c @@ -53,7 +53,7 @@ #include "server.h" #include "cluster.h" -#include "slowlog.h" +#include "commandlog.h" #include "rdb.h" #include "monotonic.h" #include "script.h" @@ -62,6 +62,8 @@ #include "crc16_slottable.h" #include "valkeymodule.h" #include "io_threads.h" +#include "module.h" +#include "scripting_engine.h" #include #include #include @@ -74,6 +76,12 @@ * pointers that have an API the module can call with them) * -------------------------------------------------------------------------- */ 
+struct moduleLoadQueueEntry { + sds path; + int argc; + robj **argv; +}; + struct ValkeyModuleInfoCtx { struct ValkeyModule *module; dict *requested_sections; @@ -643,6 +651,48 @@ void *VM_PoolAlloc(ValkeyModuleCtx *ctx, size_t bytes) { * Helpers for modules API implementation * -------------------------------------------------------------------------- */ +static void initClientModuleData(client *c) { + if (c->module_data) return; + c->module_data = zcalloc(sizeof(ClientModuleData)); +} + +void freeClientModuleData(client *c) { + if (!c->module_data) return; + /* Free the ValkeyModuleBlockedClient held onto for reprocessing if not already freed. */ + zfree(c->module_data->module_blocked_client); + zfree(c->module_data); + c->module_data = NULL; +} + +void moduleEnqueueLoadModule(sds path, sds *argv, int argc) { + int i; + struct moduleLoadQueueEntry *loadmod; + + loadmod = zmalloc(sizeof(struct moduleLoadQueueEntry)); + loadmod->argv = argc ? zmalloc(sizeof(robj *) * argc) : NULL; + loadmod->path = sdsnew(path); + loadmod->argc = argc; + for (i = 0; i < argc; i++) { + loadmod->argv[i] = createRawStringObject(argv[i], sdslen(argv[i])); + } + listAddNodeTail(server.loadmodule_queue, loadmod); +} + +sds moduleLoadQueueEntryToLoadmoduleOptionStr(ValkeyModule *module, + const char *config_option_str) { + sds line; + + line = sdsnew(config_option_str); + line = sdscatlen(line, " ", 1); + line = sdscatsds(line, module->loadmod->path); + for (int i = 0; i < module->loadmod->argc; i++) { + line = sdscatlen(line, " ", 1); + line = sdscatsds(line, module->loadmod->argv[i]->ptr); + } + + return line; +} + client *moduleAllocTempClient(void) { client *c = NULL; @@ -684,11 +734,11 @@ void moduleReleaseTempClient(client *c) { c->flag.fake = 1; c->user = NULL; /* Root user */ c->cmd = c->lastcmd = c->realcmd = c->io_parsed_cmd = NULL; - if (c->bstate.async_rm_call_handle) { - ValkeyModuleAsyncRMCallPromise *promise = c->bstate.async_rm_call_handle; + if (c->bstate && 
c->bstate->async_rm_call_handle) { + ValkeyModuleAsyncRMCallPromise *promise = c->bstate->async_rm_call_handle; promise->c = NULL; /* Remove the client from the promise so it will no longer be possible to abort it. */ freeValkeyModuleAsyncRMCallPromise(promise); - c->bstate.async_rm_call_handle = NULL; + c->bstate->async_rm_call_handle = NULL; } moduleTempClients[moduleTempClientCount++] = c; } @@ -860,7 +910,7 @@ static CallReply *moduleParseReply(client *c, ValkeyModuleCtx *ctx) { void moduleCallCommandUnblockedHandler(client *c) { ValkeyModuleCtx ctx; - ValkeyModuleAsyncRMCallPromise *promise = c->bstate.async_rm_call_handle; + ValkeyModuleAsyncRMCallPromise *promise = c->bstate->async_rm_call_handle; serverAssert(promise); ValkeyModule *module = promise->module; if (!promise->on_unblocked) { @@ -879,6 +929,15 @@ void moduleCallCommandUnblockedHandler(client *c) { moduleReleaseTempClient(c); } +/* Allocates the memory necessary to hold the ValkeyModuleCtx structure, and + * returns the pointer to the allocated memory. + * + * Used by the scripting engines implementation to cache the context structure. + */ +ValkeyModuleCtx *moduleAllocateContext(void) { + return (ValkeyModuleCtx *)zcalloc(sizeof(ValkeyModuleCtx)); +} + /* Create a module ctx and keep track of the nesting level. * * Note: When creating ctx for threads (VM_GetThreadSafeContext and @@ -921,6 +980,16 @@ void moduleCreateContext(ValkeyModuleCtx *out_ctx, ValkeyModule *module, int ctx } } +/* Initialize a module context to be used by scripting engines callback + * functions. + */ +void moduleScriptingEngineInitContext(ValkeyModuleCtx *out_ctx, + ValkeyModule *module, + client *client) { + moduleCreateContext(out_ctx, module, VALKEYMODULE_CTX_NONE); + out_ctx->client = client; +} + /* This command binds the normal command invocation with commands * exported by modules. 
*/ void ValkeyModuleCommandDispatcher(client *c) { @@ -1152,7 +1221,8 @@ int64_t commandFlagsFromString(char *s) { else if (!strcasecmp(t,"blocking")) flags |= CMD_BLOCKING; else if (!strcasecmp(t,"allow-stale")) flags |= CMD_STALE; else if (!strcasecmp(t,"no-monitor")) flags |= CMD_SKIP_MONITOR; - else if (!strcasecmp(t,"no-slowlog")) flags |= CMD_SKIP_SLOWLOG; + else if (!strcasecmp(t,"no-slowlog")) flags |= CMD_SKIP_COMMANDLOG; + else if (!strcasecmp(t,"no-commandlog")) flags |= CMD_SKIP_COMMANDLOG; else if (!strcasecmp(t,"fast")) flags |= CMD_FAST; else if (!strcasecmp(t,"no-auth")) flags |= CMD_NO_AUTH; else if (!strcasecmp(t,"may-replicate")) flags |= CMD_MAY_REPLICATE; @@ -1227,7 +1297,8 @@ ValkeyModuleCommand *moduleCreateCommandProxy(struct ValkeyModule *module, * this means. * * **"no-monitor"**: Don't propagate the command on monitor. Use this if * the command has sensitive data among the arguments. - * * **"no-slowlog"**: Don't log this command in the slowlog. Use this if + * * **"no-slowlog"**: Deprecated, please use "no-commandlog". + * * **"no-commandlog"**: Don't log this command in the commandlog. Use this if * the command has sensitive data among the arguments. * * **"fast"**: The command time complexity is not greater * than O(log(N)) where N is the size of the collection or @@ -6513,7 +6584,7 @@ ValkeyModuleCallReply *VM_Call(ValkeyModuleCtx *ctx, const char *cmdname, const .ctx = (ctx->flags & VALKEYMODULE_CTX_AUTO_MEMORY) ? 
ctx : NULL, }; reply = callReplyCreatePromise(promise); - c->bstate.async_rm_call_handle = promise; + c->bstate->async_rm_call_handle = promise; if (!(call_flags & CMD_CALL_PROPAGATE_AOF)) { /* No need for AOF propagation, set the relevant flags of the client */ c->flag.module_prevent_aof_prop = 1; @@ -7381,7 +7452,7 @@ void *VM_LoadDataTypeFromStringEncver(const ValkeyModuleString *str, const modul void *ret; rioInitWithBuffer(&payload, str->ptr); - moduleInitIOContext(io, (moduleType *)mt, &payload, NULL, -1); + moduleInitIOContext(&io, (moduleType *)mt, &payload, NULL, -1); /* All VM_Save*() calls always write a version 2 compatible format, so we * need to make sure we read the same. @@ -7413,7 +7484,7 @@ ValkeyModuleString *VM_SaveDataTypeToString(ValkeyModuleCtx *ctx, void *data, co ValkeyModuleIO io; rioInitWithBuffer(&payload, sdsempty()); - moduleInitIOContext(io, (moduleType *)mt, &payload, NULL, -1); + moduleInitIOContext(&io, (moduleType *)mt, &payload, NULL, -1); mt->rdb_save(&io, data); if (io.ctx) { moduleFreeContext(io.ctx); @@ -7611,7 +7682,7 @@ void VM__Assert(const char *estr, const char *file, int line) { * command. The call is skipped if the latency is smaller than the configured * latency-monitor-threshold. */ void VM_LatencyAddSample(const char *event, mstime_t latency) { - if (latency >= server.latency_monitor_threshold) latencyAddSample(event, latency); + latencyAddSampleIfNeeded(event, latency); } /* -------------------------------------------------------------------------- @@ -7623,7 +7694,7 @@ void VM_LatencyAddSample(const char *event, mstime_t latency) { /* Returns 1 if the client already in the moduleUnblocked list, 0 otherwise. 
*/ int isModuleClientUnblocked(client *c) { - ValkeyModuleBlockedClient *bc = c->bstate.module_blocked_handle; + ValkeyModuleBlockedClient *bc = c->bstate->module_blocked_handle; return bc->unblocked == 1; } @@ -7641,7 +7712,7 @@ int isModuleClientUnblocked(client *c) { * The structure ValkeyModuleBlockedClient will be always deallocated when * running the list of clients blocked by a module that need to be unblocked. */ void unblockClientFromModule(client *c) { - ValkeyModuleBlockedClient *bc = c->bstate.module_blocked_handle; + ValkeyModuleBlockedClient *bc = c->bstate->module_blocked_handle; /* Call the disconnection callback if any. Note that * bc->disconnect_callback is set to NULL if the client gets disconnected @@ -7709,9 +7780,10 @@ ValkeyModuleBlockedClient *moduleBlockClient(ValkeyModuleCtx *ctx, client *c = ctx->client; int islua = scriptIsRunning(); int ismulti = server.in_exec; + initClientBlockingState(c); - c->bstate.module_blocked_handle = zmalloc(sizeof(ValkeyModuleBlockedClient)); - ValkeyModuleBlockedClient *bc = c->bstate.module_blocked_handle; + c->bstate->module_blocked_handle = zmalloc(sizeof(ValkeyModuleBlockedClient)); + ValkeyModuleBlockedClient *bc = c->bstate->module_blocked_handle; ctx->module->blocked_clients++; /* We need to handle the invalid operation of calling modules blocking @@ -7739,7 +7811,7 @@ ValkeyModuleBlockedClient *moduleBlockClient(ValkeyModuleCtx *ctx, if (timeout_ms) { mstime_t now = mstime(); if (timeout_ms > LLONG_MAX - now) { - c->bstate.module_blocked_handle = NULL; + c->bstate->module_blocked_handle = NULL; addReplyError(c, "timeout is out of range"); /* 'timeout_ms+now' would overflow */ return bc; } @@ -7747,20 +7819,20 @@ ValkeyModuleBlockedClient *moduleBlockClient(ValkeyModuleCtx *ctx, } if (islua || ismulti) { - c->bstate.module_blocked_handle = NULL; + c->bstate->module_blocked_handle = NULL; addReplyError(c, islua ? 
"Blocking module command called from Lua script" : "Blocking module command called from transaction"); } else if (ctx->flags & VALKEYMODULE_CTX_BLOCKED_REPLY) { - c->bstate.module_blocked_handle = NULL; + c->bstate->module_blocked_handle = NULL; addReplyError(c, "Blocking module command called from a Reply callback context"); } else if (!auth_reply_callback && clientHasModuleAuthInProgress(c)) { - c->bstate.module_blocked_handle = NULL; + c->bstate->module_blocked_handle = NULL; addReplyError(c, "Clients undergoing module based authentication can only be blocked on auth"); } else { if (keys) { blockForKeys(c, BLOCKED_MODULE, keys, numkeys, timeout, flags & VALKEYMODULE_BLOCK_UNBLOCK_DELETED); } else { - c->bstate.timeout = timeout; + c->bstate->timeout = timeout; blockClient(c, BLOCKED_MODULE); } } @@ -7856,7 +7928,7 @@ void moduleUnregisterAuthCBs(ValkeyModule *module) { /* Search for & attempt next module auth callback after skipping the ones already attempted. * Returns the result of the module auth callback. */ int attemptNextAuthCb(client *c, robj *username, robj *password, robj **err) { - int handle_next_callback = c->module_auth_ctx == NULL; + int handle_next_callback = (!c->module_data || c->module_data->module_auth_ctx == NULL); ValkeyModuleAuthCtx *cur_auth_ctx = NULL; listNode *ln; listIter li; @@ -7866,7 +7938,7 @@ int attemptNextAuthCb(client *c, robj *username, robj *password, robj **err) { cur_auth_ctx = listNodeValue(ln); /* Skip over the previously attempted auth contexts. */ if (!handle_next_callback) { - handle_next_callback = cur_auth_ctx == c->module_auth_ctx; + handle_next_callback = cur_auth_ctx == c->module_data->module_auth_ctx; continue; } /* Remove the module auth complete flag before we attempt the next cb. 
*/ @@ -7875,7 +7947,8 @@ int attemptNextAuthCb(client *c, robj *username, robj *password, robj **err) { moduleCreateContext(&ctx, cur_auth_ctx->module, VALKEYMODULE_CTX_NONE); ctx.client = c; *err = NULL; - c->module_auth_ctx = cur_auth_ctx; + initClientModuleData(c); + c->module_data->module_auth_ctx = cur_auth_ctx; result = cur_auth_ctx->auth_cb(&ctx, username, password, err); moduleFreeContext(&ctx); if (result == VALKEYMODULE_AUTH_HANDLED) break; @@ -7891,8 +7964,8 @@ int attemptNextAuthCb(client *c, robj *username, robj *password, robj **err) { * return the result of the reply callback. */ int attemptBlockedAuthReplyCallback(client *c, robj *username, robj *password, robj **err) { int result = VALKEYMODULE_AUTH_NOT_HANDLED; - if (!c->module_blocked_client) return result; - ValkeyModuleBlockedClient *bc = (ValkeyModuleBlockedClient *)c->module_blocked_client; + if (!c->module_data || !c->module_data->module_blocked_client) return result; + ValkeyModuleBlockedClient *bc = (ValkeyModuleBlockedClient *)c->module_data->module_blocked_client; bc->client = c; if (bc->auth_reply_cb) { ValkeyModuleCtx ctx; @@ -7905,7 +7978,7 @@ int attemptBlockedAuthReplyCallback(client *c, robj *username, robj *password, r moduleFreeContext(&ctx); } moduleInvokeFreePrivDataCallback(c, bc); - c->module_blocked_client = NULL; + c->module_data->module_blocked_client = NULL; c->lastcmd->microseconds += bc->background_duration; bc->module->blocked_clients--; zfree(bc); @@ -7933,7 +8006,7 @@ int checkModuleAuthentication(client *c, robj *username, robj *password, robj ** serverAssert(result == VALKEYMODULE_AUTH_HANDLED); return AUTH_BLOCKED; } - c->module_auth_ctx = NULL; + if (c->module_data) c->module_data->module_auth_ctx = NULL; if (result == VALKEYMODULE_AUTH_NOT_HANDLED) { c->flag.module_auth_has_result = 0; return AUTH_NOT_HANDLED; @@ -7955,7 +8028,7 @@ int checkModuleAuthentication(client *c, robj *username, robj *password, robj ** * This function returns 1 if client was served (and 
should be unblocked) */ int moduleTryServeClientBlockedOnKey(client *c, robj *key) { int served = 0; - ValkeyModuleBlockedClient *bc = c->bstate.module_blocked_handle; + ValkeyModuleBlockedClient *bc = c->bstate->module_blocked_handle; /* Protect against re-processing: don't serve clients that are already * in the unblocking list for any reason (including VM_UnblockClient() @@ -8167,14 +8240,14 @@ int moduleUnblockClientByHandle(ValkeyModuleBlockedClient *bc, void *privdata) { /* This API is used by the server core to unblock a client that was blocked * by a module. */ void moduleUnblockClient(client *c) { - ValkeyModuleBlockedClient *bc = c->bstate.module_blocked_handle; + ValkeyModuleBlockedClient *bc = c->bstate->module_blocked_handle; moduleUnblockClientByHandle(bc, NULL); } /* Return true if the client 'c' was blocked by a module using * VM_BlockClientOnKeys(). */ int moduleClientIsBlockedOnKeys(client *c) { - ValkeyModuleBlockedClient *bc = c->bstate.module_blocked_handle; + ValkeyModuleBlockedClient *bc = c->bstate->module_blocked_handle; return bc->blocked_on_keys; } @@ -8284,7 +8357,7 @@ void moduleHandleBlockedClients(void) { /* Hold onto the blocked client if module auth is in progress. The reply callback is invoked * when the client is reprocessed. */ if (c && clientHasModuleAuthInProgress(c)) { - c->module_blocked_client = bc; + c->module_data->module_blocked_client = bc; } else { /* Free privdata if any. */ moduleInvokeFreePrivDataCallback(c, bc); @@ -8305,7 +8378,7 @@ void moduleHandleBlockedClients(void) { if (c && !clientHasModuleAuthInProgress(c)) { int had_errors = c->deferred_reply_errors ? !!listLength(c->deferred_reply_errors) : (server.stat_total_error_replies != prev_error_replies); - updateStatsOnUnblock(c, bc->background_duration, reply_us, had_errors); + updateStatsOnUnblock(c, bc->background_duration, reply_us, (had_errors ? 
ERROR_COMMAND_FAILED : 0)); } if (c != NULL) { @@ -8346,9 +8419,9 @@ void moduleHandleBlockedClients(void) { * moduleBlockedClientTimedOut(). */ int moduleBlockedClientMayTimeout(client *c) { - if (c->bstate.btype != BLOCKED_MODULE) return 1; + if (c->bstate->btype != BLOCKED_MODULE) return 1; - ValkeyModuleBlockedClient *bc = c->bstate.module_blocked_handle; + ValkeyModuleBlockedClient *bc = c->bstate->module_blocked_handle; return (bc && bc->timeout_callback != NULL); } @@ -8364,7 +8437,7 @@ int moduleBlockedClientMayTimeout(client *c) { * of the client synchronously. This ensures that we can reply to the client before * resetClient() is called. */ void moduleBlockedClientTimedOut(client *c, int from_module) { - ValkeyModuleBlockedClient *bc = c->bstate.module_blocked_handle; + ValkeyModuleBlockedClient *bc = c->bstate->module_blocked_handle; /* Protect against re-processing: don't serve clients that are already * in the unblocking list for any reason (including VM_UnblockClient() @@ -8391,7 +8464,8 @@ void moduleBlockedClientTimedOut(client *c, int from_module) { moduleFreeContext(&ctx); if (!from_module) - updateStatsOnUnblock(c, bc->background_duration, 0, server.stat_total_error_replies != prev_error_replies); + updateStatsOnUnblock(c, bc->background_duration, 0, + ((server.stat_total_error_replies != prev_error_replies) ? ERROR_COMMAND_FAILED : 0)); /* For timeout events, we do not want to call the disconnect callback, * because the blocked client will be automatically disconnected in @@ -9502,16 +9576,16 @@ static void eventLoopHandleOneShotEvents(void) { * A client's user can be changed through the AUTH command, module * authentication, and when a client is freed. */ void moduleNotifyUserChanged(client *c) { - if (c->auth_callback) { - c->auth_callback(c->id, c->auth_callback_privdata); + if (!c->module_data || !c->module_data->auth_callback) return; - /* The callback will fire exactly once, even if the user remains - * the same. 
It is expected to completely clean up the state - * so all references are cleared here. */ - c->auth_callback = NULL; - c->auth_callback_privdata = NULL; - c->auth_module = NULL; - } + c->module_data->auth_callback(c->id, c->module_data->auth_callback_privdata); + + /* The callback will fire exactly once, even if the user remains + * the same. It is expected to completely clean up the state + * so all references are cleared here. */ + c->module_data->auth_callback = NULL; + c->module_data->auth_callback_privdata = NULL; + c->module_data->auth_module = NULL; } void revokeClientAuthentication(client *c) { @@ -9542,9 +9616,9 @@ static void moduleFreeAuthenticatedClients(ValkeyModule *module) { listRewind(server.clients, &li); while ((ln = listNext(&li)) != NULL) { client *c = listNodeValue(ln); - if (!c->auth_module) continue; + if (!c->module_data || !c->module_data->auth_module) continue; - ValkeyModule *auth_module = (ValkeyModule *)c->auth_module; + ValkeyModule *auth_module = (ValkeyModule *)c->module_data->auth_module; if (auth_module == module) { revokeClientAuthentication(c); } @@ -9852,9 +9926,10 @@ static int authenticateClientWithUser(ValkeyModuleCtx *ctx, } if (callback) { - ctx->client->auth_callback = callback; - ctx->client->auth_callback_privdata = privdata; - ctx->client->auth_module = ctx->module; + initClientModuleData(ctx->client); + ctx->client->module_data->auth_callback = callback; + ctx->client->module_data->auth_callback_privdata = privdata; + ctx->client->module_data->auth_module = ctx->module; } if (client_id) { @@ -10399,7 +10474,7 @@ ValkeyModuleServerInfoData *VM_GetServerInfo(ValkeyModuleCtx *ctx, const char *s * context instead of passing NULL. 
*/ void VM_FreeServerInfo(ValkeyModuleCtx *ctx, ValkeyModuleServerInfoData *data) { if (ctx != NULL) autoMemoryFreed(ctx, VALKEYMODULE_AM_INFO, data); - raxFreeWithCallback(data->rax, (void (*)(void *))sdsfree); + raxFreeWithCallback(data->rax, sdsfreeVoid); zfree(data); } @@ -11017,21 +11092,27 @@ typedef struct { ValkeyModuleScanKeyCB fn; } ScanKeyCBData; -static void moduleScanKeyCallback(void *privdata, const dictEntry *de) { +static void moduleScanKeyHashtableCallback(void *privdata, void *entry) { ScanKeyCBData *data = privdata; - sds key = dictGetKey(de); robj *o = data->key->value; - robj *field = createStringObject(key, sdslen(key)); robj *value = NULL; + sds key = NULL; + if (o->type == OBJ_SET) { - value = NULL; + key = entry; + /* no value */ + } else if (o->type == OBJ_ZSET) { + zskiplistNode *node = (zskiplistNode *)entry; + key = node->ele; + value = createStringObjectFromLongDouble(node->score, 0); } else if (o->type == OBJ_HASH) { - sds val = dictGetVal(de); + key = hashTypeEntryGetField(entry); + sds val = hashTypeEntryGetValue(entry); value = createStringObject(val, sdslen(val)); - } else if (o->type == OBJ_ZSET) { - double *val = (double *)dictGetVal(de); - value = createStringObjectFromLongDouble(*val, 0); + } else { + serverPanic("unexpected object type"); } + robj *field = createStringObject(key, sdslen(key)); data->fn(data->key, field, value, data->user_data); decrRefCount(field); @@ -11091,14 +11172,14 @@ int VM_ScanKey(ValkeyModuleKey *key, ValkeyModuleScanCursor *cursor, ValkeyModul errno = EINVAL; return 0; } - dict *ht = NULL; + hashtable *ht = NULL; robj *o = key->value; if (o->type == OBJ_SET) { - if (o->encoding == OBJ_ENCODING_HT) ht = o->ptr; + if (o->encoding == OBJ_ENCODING_HASHTABLE) ht = o->ptr; } else if (o->type == OBJ_HASH) { - if (o->encoding == OBJ_ENCODING_HT) ht = o->ptr; + if (o->encoding == OBJ_ENCODING_HASHTABLE) ht = o->ptr; } else if (o->type == OBJ_ZSET) { - if (o->encoding == OBJ_ENCODING_SKIPLIST) ht = ((zset 
*)o->ptr)->dict; + if (o->encoding == OBJ_ENCODING_SKIPLIST) ht = ((zset *)o->ptr)->ht; } else { errno = EINVAL; return 0; @@ -11110,7 +11191,7 @@ int VM_ScanKey(ValkeyModuleKey *key, ValkeyModuleScanCursor *cursor, ValkeyModul int ret = 1; if (ht) { ScanKeyCBData data = {key, privdata, fn}; - cursor->cursor = dictScan(ht, cursor->cursor, moduleScanKeyCallback, &data); + cursor->cursor = hashtableScan(ht, cursor->cursor, moduleScanKeyHashtableCallback, &data); if (cursor->cursor == 0) { cursor->done = 1; ret = 0; @@ -12083,7 +12164,7 @@ int moduleFreeCommand(struct ValkeyModule *module, struct serverCommand *cmd) { if (cmd->subcommands_ht) { hashtableIterator iter; void *next; - hashtableInitSafeIterator(&iter, cmd->subcommands_ht); + hashtableInitIterator(&iter, cmd->subcommands_ht, HASHTABLE_ITER_SAFE); while (hashtableNext(&iter, &next)) { struct serverCommand *sub = next; if (moduleFreeCommand(module, sub) != C_OK) continue; @@ -12106,7 +12187,7 @@ void moduleUnregisterCommands(struct ValkeyModule *module) { /* Unregister all the commands registered by this module. */ hashtableIterator iter; void *next; - hashtableInitSafeIterator(&iter, server.commands); + hashtableInitIterator(&iter, server.commands, HASHTABLE_ITER_SAFE); while (hashtableNext(&iter, &next)) { struct serverCommand *cmd = next; if (moduleFreeCommand(module, cmd) != C_OK) continue; @@ -13055,6 +13136,62 @@ int VM_RdbSave(ValkeyModuleCtx *ctx, ValkeyModuleRdbStream *stream, int flags) { return VALKEYMODULE_OK; } +/* Registers a new scripting engine in the server. + * + * - `module_ctx`: the module context object. + * + * - `engine_name`: the name of the scripting engine. This name will match + * against the engine name specified in the script header using a shebang. + * + * - `engine_ctx`: engine specific context pointer. + * + * - `engine_methods`: the struct with the scripting engine callback functions + * pointers. 
+ * + * Returns VALKEYMODULE_OK if the engine is successfully registered, and + * VALKEYMODULE_ERR in case some failure occurs. In case of a failure, an error + * message is logged. + */ +int VM_RegisterScriptingEngine(ValkeyModuleCtx *module_ctx, + const char *engine_name, + ValkeyModuleScriptingEngineCtx *engine_ctx, + ValkeyModuleScriptingEngineMethods *engine_methods) { + serverLog(LL_DEBUG, "Registering a new scripting engine: %s", engine_name); + + if (engine_methods->version > VALKEYMODULE_SCRIPTING_ENGINE_ABI_VERSION) { + serverLog(LL_WARNING, "The engine implementation version is greater " + "than what this server supports. Server ABI " + "Version: %lu, Engine ABI version: %lu", + VALKEYMODULE_SCRIPTING_ENGINE_ABI_VERSION, + (unsigned long)engine_methods->version); + return VALKEYMODULE_ERR; + } + + if (scriptingEngineManagerRegister(engine_name, + module_ctx->module, + engine_ctx, + engine_methods) != C_OK) { + return VALKEYMODULE_ERR; + } + + return VALKEYMODULE_OK; +} + +/* Removes the scripting engine from the server. + * + * `engine_name` is the name of the scripting engine. + * + * Returns VALKEYMODULE_OK. + * + */ +int VM_UnregisterScriptingEngine(ValkeyModuleCtx *ctx, const char *engine_name) { + UNUSED(ctx); + if (scriptingEngineManagerUnregister(engine_name) != C_OK) { + return VALKEYMODULE_ERR; + } + return VALKEYMODULE_OK; +} + /* MODULE command. 
* * MODULE LIST @@ -13925,4 +14062,6 @@ void moduleRegisterCoreAPI(void) { REGISTER_API(RdbStreamFree); REGISTER_API(RdbLoad); REGISTER_API(RdbSave); + REGISTER_API(RegisterScriptingEngine); + REGISTER_API(UnregisterScriptingEngine); } diff --git a/src/module.h b/src/module.h new file mode 100644 index 0000000000..f4e4de67eb --- /dev/null +++ b/src/module.h @@ -0,0 +1,233 @@ +#ifndef _MODULE_H_ +#define _MODULE_H_ + +/* This header file exposes a set of functions defined in module.c that are + * not part of the module API, but are used by the core to interact with modules + */ + +/* Extract encver / signature from a module type ID. */ +#define VALKEYMODULE_TYPE_ENCVER_BITS 10 +#define VALKEYMODULE_TYPE_ENCVER_MASK ((1 << VALKEYMODULE_TYPE_ENCVER_BITS) - 1) +#define VALKEYMODULE_TYPE_ENCVER(id) ((id) & VALKEYMODULE_TYPE_ENCVER_MASK) +#define VALKEYMODULE_TYPE_SIGN(id) \ + (((id) & ~((uint64_t)VALKEYMODULE_TYPE_ENCVER_MASK)) >> VALKEYMODULE_TYPE_ENCVER_BITS) + +/* Bit flags for moduleTypeAuxSaveFunc */ +#define VALKEYMODULE_AUX_BEFORE_RDB (1 << 0) +#define VALKEYMODULE_AUX_AFTER_RDB (1 << 1) + +struct ValkeyModule; +struct ValkeyModuleIO; +struct ValkeyModuleDigest; +struct ValkeyModuleCtx; +struct moduleLoadQueueEntry; +struct ValkeyModuleKeyOptCtx; +struct ValkeyModuleCommand; +struct clusterState; + +/* Each module type implementation should export a set of methods in order + * to serialize and deserialize the value in the RDB file, rewrite the AOF + * log, create the digest for "DEBUG DIGEST", and free the value when a key + * is deleted. 
*/ +typedef void *(*moduleTypeLoadFunc)(struct ValkeyModuleIO *io, int encver); +typedef void (*moduleTypeSaveFunc)(struct ValkeyModuleIO *io, void *value); +typedef int (*moduleTypeAuxLoadFunc)(struct ValkeyModuleIO *rdb, int encver, int when); +typedef void (*moduleTypeAuxSaveFunc)(struct ValkeyModuleIO *rdb, int when); +typedef void (*moduleTypeRewriteFunc)(struct ValkeyModuleIO *io, struct serverObject *key, void *value); +typedef void (*moduleTypeDigestFunc)(struct ValkeyModuleDigest *digest, void *value); +typedef size_t (*moduleTypeMemUsageFunc)(const void *value); +typedef void (*moduleTypeFreeFunc)(void *value); +typedef size_t (*moduleTypeFreeEffortFunc)(struct serverObject *key, const void *value); +typedef void (*moduleTypeUnlinkFunc)(struct serverObject *key, void *value); +typedef void *(*moduleTypeCopyFunc)(struct serverObject *fromkey, struct serverObject *tokey, const void *value); +typedef int (*moduleTypeDefragFunc)(struct ValkeyModuleDefragCtx *ctx, struct serverObject *key, void **value); +typedef size_t (*moduleTypeMemUsageFunc2)(struct ValkeyModuleKeyOptCtx *ctx, const void *value, size_t sample_size); +typedef void (*moduleTypeFreeFunc2)(struct ValkeyModuleKeyOptCtx *ctx, void *value); +typedef size_t (*moduleTypeFreeEffortFunc2)(struct ValkeyModuleKeyOptCtx *ctx, const void *value); +typedef void (*moduleTypeUnlinkFunc2)(struct ValkeyModuleKeyOptCtx *ctx, void *value); +typedef void *(*moduleTypeCopyFunc2)(struct ValkeyModuleKeyOptCtx *ctx, const void *value); +typedef int (*moduleTypeAuthCallback)(struct ValkeyModuleCtx *ctx, void *username, void *password, const char **err); + + +/* The module type, which is referenced in each value of a given type, defines + * the methods and links to the module exporting the type. */ +typedef struct ValkeyModuleType { + uint64_t id; /* Higher 54 bits of type ID + 10 lower bits of encoding ver. 
*/ + struct ValkeyModule *module; + moduleTypeLoadFunc rdb_load; + moduleTypeSaveFunc rdb_save; + moduleTypeRewriteFunc aof_rewrite; + moduleTypeMemUsageFunc mem_usage; + moduleTypeDigestFunc digest; + moduleTypeFreeFunc free; + moduleTypeFreeEffortFunc free_effort; + moduleTypeUnlinkFunc unlink; + moduleTypeCopyFunc copy; + moduleTypeDefragFunc defrag; + moduleTypeAuxLoadFunc aux_load; + moduleTypeAuxSaveFunc aux_save; + moduleTypeMemUsageFunc2 mem_usage2; + moduleTypeFreeEffortFunc2 free_effort2; + moduleTypeUnlinkFunc2 unlink2; + moduleTypeCopyFunc2 copy2; + moduleTypeAuxSaveFunc aux_save2; + int aux_save_triggers; + char name[10]; /* 9 bytes name + null term. Charset: A-Z a-z 0-9 _- */ +} moduleType; + +/* In Object 'robj' structures of type OBJ_MODULE, the value pointer + * is set to the following structure, referencing the moduleType structure + * in order to work with the value, and at the same time providing a raw + * pointer to the value, as created by the module commands operating with + * the module type. + * + * So for example in order to free such a value, it is possible to use + * the following code: + * + * if (robj->type == OBJ_MODULE) { + * moduleValue *mt = robj->ptr; + * mt->type->free(mt->value); + * zfree(mt); // We need to release this in-the-middle struct as well. + * } + */ +typedef struct moduleValue { + moduleType *type; + void *value; +} moduleValue; + +/* This structure represents a module inside the system. */ +typedef struct ValkeyModule { + void *handle; /* Module dlopen() handle. */ + char *name; /* Module name. */ + int ver; /* Module version. We use just progressive integers. */ + int apiver; /* Module API version as requested during initialization.*/ + list *types; /* Module data types. */ + list *usedby; /* List of modules using APIs from this one. */ + list *using; /* List of modules we use some APIs of. */ + list *filters; /* List of filters the module has registered. 
*/ + list *module_configs; /* List of configurations the module has registered */ + int configs_initialized; /* Have the module configurations been initialized? */ + int in_call; /* RM_Call() nesting level */ + int in_hook; /* Hooks callback nesting level for this module (0 or 1). */ + int options; /* Module options and capabilities. */ + int blocked_clients; /* Count of ValkeyModuleBlockedClient in this module. */ + ValkeyModuleInfoFunc info_cb; /* Callback for module to add INFO fields. */ + ValkeyModuleDefragFunc defrag_cb; /* Callback for global data defrag. */ + struct moduleLoadQueueEntry *loadmod; /* Module load arguments for config rewrite. */ + int num_commands_with_acl_categories; /* Number of commands in this module included in acl categories */ + int onload; /* Flag to identify if the call is being made from Onload (0 or 1) */ + size_t num_acl_categories_added; /* Number of acl categories added by this module. */ +} ValkeyModule; + +/* This is a wrapper for the 'rio' streams used inside rdb.c in the server, so that + * the user does not have to take the total count of the written bytes nor + * to care about error conditions. */ +typedef struct ValkeyModuleIO { + size_t bytes; /* Bytes read / written so far. */ + rio *rio; /* Rio stream. */ + moduleType *type; /* Module type doing the operation. */ + int error; /* True if error condition happened. */ + ValkeyModuleCtx *ctx; /* Optional context, see RM_GetContextFromIO()*/ + robj *key; /* Optional name of key processed */ + int dbid; /* The dbid of the key being processed, -1 when unknown. */ + sds pre_flush_buffer; /* A buffer that should be flushed before next write operation + * See rdbSaveSingleModuleAux for more details */ +} ValkeyModuleIO; + +/* Macro to initialize an IO context. Note that the 'ver' field is populated + * inside rdb.c according to the version of the value to load. 
*/ +static inline void moduleInitIOContext(ValkeyModuleIO *iovar, + moduleType *mtype, + rio *rioptr, + robj *keyptr, + int db) { + iovar->rio = rioptr; + iovar->type = mtype; + iovar->bytes = 0; + iovar->error = 0; + iovar->key = keyptr; + iovar->dbid = db; + iovar->ctx = NULL; + iovar->pre_flush_buffer = NULL; +} + +/* This is a structure used to export DEBUG DIGEST capabilities to + * modules. We want to capture both the ordered and unordered elements of + * a data structure, so that a digest can be created in a way that correctly + * reflects the values. See the DEBUG DIGEST command implementation for more + * background. */ +typedef struct ValkeyModuleDigest { + unsigned char o[20]; /* Ordered elements. */ + unsigned char x[20]; /* Xored elements. */ + robj *key; /* Optional name of key processed */ + int dbid; /* The dbid of the key being processed */ +} ValkeyModuleDigest; + +/* Just start with a digest composed of all zero bytes. */ +static inline void moduleInitDigestContext(ValkeyModuleDigest *mdvar) { + memset(mdvar->o, 0, sizeof(mdvar->o)); + memset(mdvar->x, 0, sizeof(mdvar->x)); +} + +void moduleEnqueueLoadModule(sds path, sds *argv, int argc); +sds moduleLoadQueueEntryToLoadmoduleOptionStr(ValkeyModule *module, + const char *config_option_str); +ValkeyModuleCtx *moduleAllocateContext(void); +void moduleScriptingEngineInitContext(ValkeyModuleCtx *out_ctx, + ValkeyModule *module, + client *client); +void moduleFreeContext(ValkeyModuleCtx *ctx); +void moduleInitModulesSystem(void); +void moduleInitModulesSystemLast(void); +void modulesCron(void); +int moduleLoad(const char *path, void **argv, int argc, int is_loadex); +int moduleUnload(sds name, const char **errmsg); +void moduleLoadFromQueue(void); +int moduleGetCommandKeysViaAPI(struct serverCommand *cmd, robj **argv, int argc, getKeysResult *result); +int moduleGetCommandChannelsViaAPI(struct serverCommand *cmd, robj **argv, int argc, getKeysResult *result); +moduleType 
*moduleTypeLookupModuleByID(uint64_t id); +moduleType *moduleTypeLookupModuleByName(const char *name); +moduleType *moduleTypeLookupModuleByNameIgnoreCase(const char *name); +void moduleTypeNameByID(char *name, uint64_t moduleid); +const char *moduleTypeModuleName(moduleType *mt); +const char *moduleNameFromCommand(struct serverCommand *cmd); +void moduleFreeContext(ValkeyModuleCtx *ctx); +void moduleCallCommandUnblockedHandler(client *c); +int isModuleClientUnblocked(client *c); +void unblockClientFromModule(client *c); +void moduleHandleBlockedClients(void); +void moduleBlockedClientTimedOut(client *c, int from_module); +void modulePipeReadable(aeEventLoop *el, int fd, void *privdata, int mask); +size_t moduleCount(void); +void moduleAcquireGIL(void); +int moduleTryAcquireGIL(void); +void moduleReleaseGIL(void); +void moduleNotifyKeyspaceEvent(int type, const char *event, robj *key, int dbid); +void firePostExecutionUnitJobs(void); +void moduleCallCommandFilters(client *c); +void modulePostExecutionUnitOperations(void); +void ModuleForkDoneHandler(int exitcode, int bysignal); +int TerminateModuleForkChild(int child_pid, int wait); +ssize_t rdbSaveModulesAux(rio *rdb, int when); +int moduleAllDatatypesHandleErrors(void); +int moduleAllModulesHandleReplAsyncLoad(void); +sds modulesCollectInfo(sds info, dict *sections_dict, int for_crash_report, int sections); +void moduleFireServerEvent(uint64_t eid, int subid, void *data); +void processModuleLoadingProgressEvent(int is_aof); +int moduleTryServeClientBlockedOnKey(client *c, robj *key); +void moduleUnblockClient(client *c); +int moduleBlockedClientMayTimeout(client *c); +int moduleClientIsBlockedOnKeys(client *c); +void moduleNotifyUserChanged(client *c); +void moduleNotifyKeyUnlink(robj *key, robj *val, int dbid, int flags); +size_t moduleGetFreeEffort(robj *key, robj *val, int dbid); +size_t moduleGetMemUsage(robj *key, robj *val, size_t sample_size, int dbid); +robj *moduleTypeDupOrReply(client *c, robj *fromkey, 
robj *tokey, int todb, robj *value); +int moduleDefragValue(robj *key, robj *obj, int dbid); +int moduleLateDefrag(robj *key, robj *value, unsigned long *cursor, monotime endtime, int dbid); +void moduleDefragGlobals(void); +void *moduleGetHandleByName(char *modulename); +int moduleIsModuleCommand(void *module_handle, struct serverCommand *cmd); +void freeClientModuleData(client *c); + +#endif /* _MODULE_H_ */ diff --git a/src/multi.c b/src/multi.c index 9e1f019244..0318c418cc 100644 --- a/src/multi.c +++ b/src/multi.c @@ -33,33 +33,42 @@ /* Client state initialization for MULTI/EXEC */ void initClientMultiState(client *c) { - c->mstate.commands = NULL; - c->mstate.count = 0; - c->mstate.cmd_flags = 0; - c->mstate.cmd_inv_flags = 0; - c->mstate.argv_len_sums = 0; - c->mstate.alloc_count = 0; + if (c->mstate) return; + c->mstate = zcalloc(sizeof(multiState)); } -/* Release all the resources associated with MULTI/EXEC state */ -void freeClientMultiState(client *c) { - int j; - - for (j = 0; j < c->mstate.count; j++) { +void freeClientMultiStateCmds(client *c) { + for (int j = 0; j < c->mstate->count; j++) { int i; - multiCmd *mc = c->mstate.commands + j; + multiCmd *mc = c->mstate->commands + j; for (i = 0; i < mc->argc; i++) decrRefCount(mc->argv[i]); zfree(mc->argv); } - zfree(c->mstate.commands); + + zfree(c->mstate->commands); + c->mstate->commands = NULL; +} + +/* Release all the resources associated with MULTI/EXEC state */ +void freeClientMultiState(client *c) { + if (!c->mstate) return; + + freeClientMultiStateCmds(c); + unwatchAllKeys(c); + zfree(c->mstate); + c->mstate = NULL; } void resetClientMultiState(client *c) { - if (c->mstate.commands) { - freeClientMultiState(c); - initClientMultiState(c); - } + if (!c->mstate || !c->mstate->commands) return; + + freeClientMultiStateCmds(c); + c->mstate->count = 0; + c->mstate->cmd_flags = 0; + c->mstate->cmd_inv_flags = 0; + c->mstate->argv_len_sums = 0; + c->mstate->alloc_count = 0; } /* Add a new command into 
the MULTI commands queue */ @@ -71,26 +80,27 @@ void queueMultiCommand(client *c, uint64_t cmd_flags) { * bother to read previous responses and didn't notice the multi was already * aborted. */ if (c->flag.dirty_cas || c->flag.dirty_exec) return; - if (c->mstate.count == 0) { + if (!c->mstate) initClientMultiState(c); + if (c->mstate->count == 0) { /* If a client is using multi/exec, assuming it is used to execute at least * two commands. Hence, creating by default size of 2. */ - c->mstate.commands = zmalloc(sizeof(multiCmd) * 2); - c->mstate.alloc_count = 2; + c->mstate->commands = zmalloc(sizeof(multiCmd) * 2); + c->mstate->alloc_count = 2; } - if (c->mstate.count == c->mstate.alloc_count) { - c->mstate.alloc_count = c->mstate.alloc_count < INT_MAX / 2 ? c->mstate.alloc_count * 2 : INT_MAX; - c->mstate.commands = zrealloc(c->mstate.commands, sizeof(multiCmd) * (c->mstate.alloc_count)); + if (c->mstate->count == c->mstate->alloc_count) { + c->mstate->alloc_count = c->mstate->alloc_count < INT_MAX / 2 ? c->mstate->alloc_count * 2 : INT_MAX; + c->mstate->commands = zrealloc(c->mstate->commands, sizeof(multiCmd) * (c->mstate->alloc_count)); } - mc = c->mstate.commands + c->mstate.count; + mc = c->mstate->commands + c->mstate->count; mc->cmd = c->cmd; mc->argc = c->argc; mc->argv = c->argv; mc->argv_len = c->argv_len; - c->mstate.count++; - c->mstate.cmd_flags |= cmd_flags; - c->mstate.cmd_inv_flags |= ~cmd_flags; - c->mstate.argv_len_sums += c->argv_len_sum + sizeof(robj *) * c->argc; + c->mstate->count++; + c->mstate->cmd_flags |= cmd_flags; + c->mstate->cmd_inv_flags |= ~cmd_flags; + c->mstate->argv_len_sums += c->argv_len_sum + sizeof(robj *) * c->argc; /* Reset the client's args since we copied them into the mstate and shouldn't * reference them from c anymore. 
*/ @@ -118,6 +128,7 @@ void flagTransaction(client *c) { } void multiCommand(client *c) { + if (!c->mstate) initClientMultiState(c); c->flag.multi = 1; addReply(c, shared.ok); } @@ -195,12 +206,12 @@ void execCommand(client *c) { orig_argv_len = c->argv_len; orig_argc = c->argc; orig_cmd = c->cmd; - addReplyArrayLen(c, c->mstate.count); - for (j = 0; j < c->mstate.count; j++) { - c->argc = c->mstate.commands[j].argc; - c->argv = c->mstate.commands[j].argv; - c->argv_len = c->mstate.commands[j].argv_len; - c->cmd = c->realcmd = c->mstate.commands[j].cmd; + addReplyArrayLen(c, c->mstate->count); + for (j = 0; j < c->mstate->count; j++) { + c->argc = c->mstate->commands[j].argc; + c->argv = c->mstate->commands[j].argv; + c->argv_len = c->mstate->commands[j].argv_len; + c->cmd = c->realcmd = c->mstate->commands[j].cmd; /* ACL permissions are also checked at the time of execution in case * they were changed after the commands were queued. */ @@ -234,12 +245,12 @@ void execCommand(client *c) { } /* Commands may alter argc/argv, restore mstate. */ - c->mstate.commands[j].argc = c->argc; - c->mstate.commands[j].argv = c->argv; - c->mstate.commands[j].argv_len = c->argv_len; - c->mstate.commands[j].cmd = c->cmd; + c->mstate->commands[j].argc = c->argc; + c->mstate->commands[j].argv = c->argv; + c->mstate->commands[j].argv_len = c->argv_len; + c->mstate->commands[j].cmd = c->cmd; - /* The original argv has already been processed for slowlog and monitor, + /* The original argv has already been processed for commandlog and monitor, * so we can safely free it before proceeding to the next command. 
*/ freeClientOriginalArgv(c); } @@ -304,10 +315,10 @@ void watchForKey(client *c, robj *key) { listNode *ln; watchedKey *wk; - if (listLength(c->watched_keys) == 0) server.watching_clients++; + if (listLength(&c->mstate->watched_keys) == 0) server.watching_clients++; /* Check if we are already watching for this key */ - listRewind(c->watched_keys, &li); + listRewind(&c->mstate->watched_keys, &li); while ((ln = listNext(&li))) { wk = listNodeValue(ln); if (wk->db == c->db && equalStringObjects(key, wk->key)) return; /* Key already watched */ @@ -326,7 +337,7 @@ void watchForKey(client *c, robj *key) { wk->db = c->db; wk->expired = keyIsExpired(c->db, key); incrRefCount(key); - listAddNodeTail(c->watched_keys, wk); + listAddNodeTail(&c->mstate->watched_keys, wk); watchedKeyLinkToClients(clients, wk); } @@ -336,8 +347,8 @@ void unwatchAllKeys(client *c) { listIter li; listNode *ln; - if (listLength(c->watched_keys) == 0) return; - listRewind(c->watched_keys, &li); + if (!c->mstate || listLength(&c->mstate->watched_keys) == 0) return; + listRewind(&c->mstate->watched_keys, &li); while ((ln = listNext(&li))) { list *clients; watchedKey *wk; @@ -350,7 +361,7 @@ void unwatchAllKeys(client *c) { /* Kill the entry at all if this was the only client */ if (listLength(clients) == 0) dictDelete(wk->db->watched_keys, wk->key); /* Remove this watched key from the client->watched list */ - listDelNode(c->watched_keys, ln); + listDelNode(&c->mstate->watched_keys, ln); decrRefCount(wk->key); zfree(wk); } @@ -363,8 +374,8 @@ int isWatchedKeyExpired(client *c) { listIter li; listNode *ln; watchedKey *wk; - if (listLength(c->watched_keys) == 0) return 0; - listRewind(c->watched_keys, &li); + if (!c->mstate || listLength(&c->mstate->watched_keys) == 0) return 0; + listRewind(&c->mstate->watched_keys, &li); while ((ln = listNext(&li))) { wk = listNodeValue(ln); if (wk->expired) continue; /* was expired when WATCH was called */ @@ -474,6 +485,9 @@ void watchCommand(client *c) { 
addReply(c, shared.ok); return; } + + if (!c->mstate) initClientMultiState(c); + for (j = 1; j < c->argc; j++) watchForKey(c, c->argv[j]); addReply(c, shared.ok); } @@ -485,11 +499,12 @@ void unwatchCommand(client *c) { } size_t multiStateMemOverhead(client *c) { - size_t mem = c->mstate.argv_len_sums; + if (!c->mstate) return 0; + size_t mem = c->mstate->argv_len_sums; /* Add watched keys overhead, Note: this doesn't take into account the watched keys themselves, because they aren't * managed per-client. */ - mem += listLength(c->watched_keys) * (sizeof(listNode) + sizeof(watchedKey)); + mem += listLength(&c->mstate->watched_keys) * (sizeof(listNode) + sizeof(watchedKey)); /* Reserved memory for queued multi commands. */ - mem += c->mstate.alloc_count * sizeof(multiCmd); + mem += c->mstate->alloc_count * sizeof(multiCmd); return mem; } diff --git a/src/networking.c b/src/networking.c index 4d386d6dc4..093d579ef4 100644 --- a/src/networking.c +++ b/src/networking.c @@ -31,10 +31,12 @@ #include "cluster.h" #include "cluster_slot_stats.h" #include "script.h" +#include "intset.h" #include "sds.h" #include "fpconv_dtoa.h" #include "fmtargs.h" #include "io_threads.h" +#include "module.h" #include #include #include @@ -42,10 +44,35 @@ #include #include +/* This struct is used to encapsulate filtering criteria for operations on clients + * such as identifying specific clients to kill or retrieve. Each field in the struct + * represents a filter that can be applied based on specific attributes of a client. */ +typedef struct { + /* A set of client IDs to filter. If NULL, no ID filtering is applied. */ + intset *ids; + /* Maximum age (in seconds) of a client connection for filtering. + * Connections younger than this value will not match. + * A value of 0 means no age filtering. */ + long long max_age; + /* Address/port of the client. If NULL, no address filtering is applied. */ + char *addr; + /* Local (bind) address/port of the client.
If NULL, no address filtering is applied. */ + char *laddr; + /* Filtering clients by authentication user. If NULL, no user-based filtering is applied. */ + user *user; + /* Client type to filter. If set to -1, no type filtering is applied. */ + int type; + /* Boolean flag to determine if the current client (`me`) should be filtered. 1 means "skip me", 0 means otherwise. */ + int skipme; +} clientFilter; + static void setProtocolError(const char *errstr, client *c); static void pauseClientsByClient(mstime_t end, int isPauseClientAll); int postponeClientRead(client *c); char *getClientSockname(client *c); +static int parseClientFiltersOrReply(client *c, int index, clientFilter *filter); +static int clientMatchesFilter(client *client, clientFilter client_filter); +static sds getAllFilteredClientsInfoString(clientFilter *client_filter, int hide_user_data); int ProcessingEventsWhileBlocked = 0; /* See processEventsWhileBlocked(). */ __thread sds thread_shared_qb = NULL; @@ -118,7 +145,7 @@ int authRequired(client *c) { } static inline int isReplicaReadyForReplData(client *replica) { - return (replica->repl_state == REPLICA_STATE_ONLINE || replica->repl_state == REPLICA_STATE_BG_RDB_LOAD) && + return (replica->repl_data->repl_state == REPLICA_STATE_ONLINE || replica->repl_data->repl_state == REPLICA_STATE_BG_RDB_LOAD) && !(replica->flag.close_asap); } @@ -134,6 +161,7 @@ client *createClient(connection *conn) { if (server.tcpkeepalive) connKeepAlive(conn, server.tcpkeepalive); connSetReadHandler(conn, readQueryFromClient); connSetPrivateData(conn, c); + conn->flags |= CONN_FLAG_ALLOW_ACCEPT_OFFLOAD; } c->buf = zmalloc_usable(PROTO_REPLY_CHUNK_BYTES, &c->buf_usable_size); selectDb(c, 0); @@ -152,8 +180,6 @@ client *createClient(connection *conn) { c->bufpos = 0; c->buf_peak = c->buf_usable_size; c->buf_peak_last_reset_time = server.unixtime; - c->ref_repl_buf_node = NULL; - c->ref_block_pos = 0; c->qb_pos = 0; c->querybuf = NULL; c->querybuf_peak = 0; @@ -178,55 +204,31 
@@ client *createClient(connection *conn) { c->ctime = c->last_interaction = server.unixtime; c->duration = 0; clientSetDefaultAuth(c); - c->repl_state = REPL_STATE_NONE; - c->repl_start_cmd_stream_on_ack = 0; - c->reploff = 0; - c->read_reploff = 0; - c->repl_applied = 0; - c->repl_ack_off = 0; - c->repl_ack_time = 0; - c->repl_aof_off = 0; - c->repl_last_partial_write = 0; - c->replica_listening_port = 0; - c->replica_addr = NULL; - c->replica_version = 0; - c->replica_capa = REPLICA_CAPA_NONE; - c->replica_req = REPLICA_REQ_NONE; - c->associated_rdb_client_id = 0; - c->rdb_client_disconnect_time = 0; c->reply = listCreate(); c->deferred_reply_errors = NULL; c->reply_bytes = 0; c->obuf_soft_limit_reached_time = 0; listSetFreeMethod(c->reply, freeClientReplyValue); listSetDupMethod(c->reply, dupClientReplyValue); - initClientBlockingState(c); + c->repl_data = NULL; + c->bstate = NULL; + c->pubsub_data = NULL; + c->module_data = NULL; + c->mstate = NULL; c->woff = 0; - c->watched_keys = listCreate(); - c->pubsub_channels = dictCreate(&objectKeyPointerValueDictType); - c->pubsub_patterns = dictCreate(&objectKeyPointerValueDictType); - c->pubsubshard_channels = dictCreate(&objectKeyPointerValueDictType); c->peerid = NULL; c->sockname = NULL; c->client_list_node = NULL; c->io_read_state = CLIENT_IDLE; c->io_write_state = CLIENT_IDLE; c->nwritten = 0; - c->client_tracking_redirection = 0; - c->client_tracking_prefixes = NULL; c->last_memory_usage = 0; c->last_memory_type = CLIENT_TYPE_NORMAL; - c->module_blocked_client = NULL; - c->module_auth_ctx = NULL; - c->auth_callback = NULL; - c->auth_callback_privdata = NULL; - c->auth_module = NULL; listInitNode(&c->clients_pending_write_node, c); listInitNode(&c->pending_read_list_node, c); c->mem_usage_bucket = NULL; c->mem_usage_bucket_node = NULL; if (conn) linkClient(c); - initClientMultiState(c); c->net_input_bytes = 0; c->net_input_bytes_curr_cmd = 0; c->net_output_bytes = 0; @@ -264,7 +266,9 @@ void 
putClientInPendingWriteQueue(client *c) { * if not already done and, for replicas, if the replica can actually receive * writes at this stage. */ if (!c->flag.pending_write && - (c->repl_state == REPL_STATE_NONE || (isReplicaReadyForReplData(c) && !c->repl_start_cmd_stream_on_ack))) { + (!c->repl_data || + c->repl_data->repl_state == REPL_STATE_NONE || + (isReplicaReadyForReplData(c) && !c->repl_data->repl_start_cmd_stream_on_ack))) { /* Here instead of installing the write handler, we just flag the * client and put it into a list of clients that have something * to write to the socket. This way before re-entering the event @@ -556,7 +560,7 @@ void afterErrorReply(client *c, const char *s, size_t len, int flags) { if (c->flag.module) { if (!c->deferred_reply_errors) { c->deferred_reply_errors = listCreate(); - listSetFreeMethod(c->deferred_reply_errors, (void (*)(void *))sdsfree); + listSetFreeMethod(c->deferred_reply_errors, sdsfreeVoid); } listAddNodeTail(c->deferred_reply_errors, sdsnewlen(s, len)); return; @@ -1338,10 +1342,10 @@ void deferredAfterErrorReply(client *c, list *errors) { void copyReplicaOutputBuffer(client *dst, client *src) { serverAssert(src->bufpos == 0 && listLength(src->reply) == 0); - if (src->ref_repl_buf_node == NULL) return; - dst->ref_repl_buf_node = src->ref_repl_buf_node; - dst->ref_block_pos = src->ref_block_pos; - ((replBufBlock *)listNodeValue(dst->ref_repl_buf_node))->refcount++; + if (src->repl_data->ref_repl_buf_node == NULL) return; + dst->repl_data->ref_repl_buf_node = src->repl_data->ref_repl_buf_node; + dst->repl_data->ref_block_pos = src->repl_data->ref_block_pos; + ((replBufBlock *)listNodeValue(dst->repl_data->ref_repl_buf_node))->refcount++; } /* Return true if the specified client has pending reply buffers to write to @@ -1351,13 +1355,13 @@ int clientHasPendingReplies(client *c) { /* Replicas use global shared replication buffer instead of * private output buffer. 
*/ serverAssert(c->bufpos == 0 && listLength(c->reply) == 0); - if (c->ref_repl_buf_node == NULL) return 0; + if (c->repl_data->ref_repl_buf_node == NULL) return 0; /* If the last replication buffer block content is totally sent, * we have nothing to send. */ listNode *ln = listLast(server.repl_buffer_blocks); replBufBlock *tail = listNodeValue(ln); - if (ln == c->ref_repl_buf_node && c->ref_block_pos == tail->used) return 0; + if (ln == c->repl_data->ref_repl_buf_node && c->repl_data->ref_block_pos == tail->used) return 0; return 1; } else { @@ -1524,23 +1528,6 @@ void disconnectReplicas(void) { } } -/* Check if there is any other replica waiting dumping RDB finished expect me. - * This function is useful to judge current dumping RDB can be used for full - * synchronization or not. */ -int anyOtherReplicaWaitRdb(client *except_me) { - listIter li; - listNode *ln; - - listRewind(server.replicas, &li); - while ((ln = listNext(&li))) { - client *replica = ln->value; - if (replica != except_me && replica->repl_state == REPLICA_STATE_WAIT_BGSAVE_END) { - return 1; - } - } - return 0; -} - /* Remove the specified client from global lists where the client could * be referenced, not including the Pub/Sub channels. * This is used by freeClient() and replicationCachePrimary(). 
*/ @@ -1565,7 +1552,7 @@ void unlinkClient(client *c) { /* Check if this is a replica waiting for diskless replication (rdb pipe), * in which case it needs to be cleaned from that list */ - if (c->flag.replica && c->repl_state == REPLICA_STATE_WAIT_BGSAVE_END && server.rdb_pipe_conns) { + if (c->repl_data && c->flag.replica && c->repl_data->repl_state == REPLICA_STATE_WAIT_BGSAVE_END && server.rdb_pipe_conns) { int i; int still_alive = 0; for (i = 0; i < server.rdb_pipe_numconns; i++) { @@ -1651,11 +1638,7 @@ void clearClientConnectionState(client *c) { clientSetDefaultAuth(c); moduleNotifyUserChanged(c); discardTransaction(c); - - pubsubUnsubscribeAllChannels(c, 0); - pubsubUnsubscribeShardAllChannels(c, 0); - pubsubUnsubscribeAllPatterns(c, 0); - unmarkClientAsPubSub(c); + freeClientPubSubData(c); if (c->name) { decrRefCount(c->name); @@ -1694,9 +1677,7 @@ void freeClient(client *c) { /* Notify module system that this client auth status changed. */ moduleNotifyUserChanged(c); - - /* Free the RedisModuleBlockedClient held onto for reprocessing if not already freed. */ - zfree(c->module_blocked_client); + freeClientModuleData(c); /* If this client was scheduled for async freeing we need to remove it * from the queue. Note that we need to do this here, because later @@ -1743,31 +1724,16 @@ void freeClient(client *c) { /* If there is any in-flight command, we don't record their duration. 
*/ c->duration = 0; if (c->flag.blocked) unblockClient(c, 1); - dictRelease(c->bstate.keys); - - /* UNWATCH all the keys */ - unwatchAllKeys(c); - listRelease(c->watched_keys); - c->watched_keys = NULL; - - /* Unsubscribe from all the pubsub channels */ - pubsubUnsubscribeAllChannels(c, 0); - pubsubUnsubscribeShardAllChannels(c, 0); - pubsubUnsubscribeAllPatterns(c, 0); - unmarkClientAsPubSub(c); - dictRelease(c->pubsub_channels); - c->pubsub_channels = NULL; - dictRelease(c->pubsub_patterns); - c->pubsub_patterns = NULL; - dictRelease(c->pubsubshard_channels); - c->pubsubshard_channels = NULL; + + freeClientBlockingState(c); + freeClientPubSubData(c); /* Free data structures. */ listRelease(c->reply); c->reply = NULL; zfree_with_size(c->buf, c->buf_usable_size); c->buf = NULL; - freeReplicaReferencedReplBuffer(c); + freeClientArgv(c); freeClientOriginalArgv(c); if (c->deferred_reply_errors) listRelease(c->deferred_reply_errors); @@ -1785,45 +1751,7 @@ void freeClient(client *c) { * places where active clients may be referenced. */ unlinkClient(c); - /* Primary/replica cleanup Case 1: - * we lost the connection with a replica. */ - if (c->flag.replica) { - /* If there is no any other replica waiting dumping RDB finished, the - * current child process need not continue to dump RDB, then we kill it. - * So child process won't use more memory, and we also can fork a new - * child process asap to dump rdb for next full synchronization or bgsave. - * But we also need to check if users enable 'save' RDB, if enable, we - * should not remove directly since that means RDB is important for users - * to keep data safe and we may delay configured 'save' for full sync. 
*/ - if (server.saveparamslen == 0 && c->repl_state == REPLICA_STATE_WAIT_BGSAVE_END && - server.child_type == CHILD_TYPE_RDB && server.rdb_child_type == RDB_CHILD_TYPE_DISK && - anyOtherReplicaWaitRdb(c) == 0) { - serverLog(LL_NOTICE, "Background saving, persistence disabled, last replica dropped, killing fork child."); - killRDBChild(); - } - if (c->repl_state == REPLICA_STATE_SEND_BULK) { - if (c->repldbfd != -1) close(c->repldbfd); - if (c->replpreamble) sdsfree(c->replpreamble); - } - list *l = (c->flag.monitor) ? server.monitors : server.replicas; - ln = listSearchKey(l, c); - serverAssert(ln != NULL); - listDelNode(l, ln); - /* We need to remember the time when we started to have zero - * attached replicas, as after some time we'll free the replication - * backlog. */ - if (getClientType(c) == CLIENT_TYPE_REPLICA && listLength(server.replicas) == 0) - server.repl_no_replicas_since = server.unixtime; - refreshGoodReplicasCount(); - /* Fire the replica change modules event. */ - if (c->repl_state == REPLICA_STATE_ONLINE) - moduleFireServerEvent(VALKEYMODULE_EVENT_REPLICA_CHANGE, VALKEYMODULE_SUBEVENT_REPLICA_CHANGE_OFFLINE, - NULL); - } - - /* Primary/replica cleanup Case 2: - * we lost the connection with the primary. */ - if (c->flag.primary) replicationHandlePrimaryDisconnection(); + freeClientReplicationData(c); /* Remove client from memory usage buckets */ if (c->mem_usage_bucket) { @@ -1839,7 +1767,6 @@ void freeClient(client *c) { freeClientMultiState(c); sdsfree(c->peerid); sdsfree(c->sockname); - sdsfree(c->replica_addr); zfree(c); } @@ -1930,10 +1857,10 @@ void beforeNextClient(client *c) { * In these scenarios, qb_pos points to the part of the current command * or the beginning of next command, and the current command is not applied yet, * so the repl_applied is not equal to qb_pos. 
*/ - if (c->repl_applied) { - sdsrange(c->querybuf, c->repl_applied, -1); - c->qb_pos -= c->repl_applied; - c->repl_applied = 0; + if (c->repl_data->repl_applied) { + sdsrange(c->querybuf, c->repl_data->repl_applied, -1); + c->qb_pos -= c->repl_data->repl_applied; + c->repl_data->repl_applied = 0; } } else { trimClientQueryBuffer(c); @@ -1972,18 +1899,18 @@ int freeClientsInAsyncFreeQueue(void) { * The primary gives a grace period before freeing this client because * it serves as a reference to the first required replication data block for * this replica */ - if (!c->rdb_client_disconnect_time) { + if (!c->repl_data->rdb_client_disconnect_time) { if (c->conn) connSetReadHandler(c->conn, NULL); - c->rdb_client_disconnect_time = server.unixtime; + c->repl_data->rdb_client_disconnect_time = server.unixtime; dualChannelServerLog(LL_VERBOSE, "Postpone RDB client id=%llu (%s) free for %d seconds", (unsigned long long)c->id, replicationGetReplicaName(c), server.wait_before_rdb_client_free); } - if (server.unixtime - c->rdb_client_disconnect_time <= server.wait_before_rdb_client_free) continue; + if (server.unixtime - c->repl_data->rdb_client_disconnect_time <= server.wait_before_rdb_client_free) continue; dualChannelServerLog( LL_NOTICE, "Replica main channel failed to establish PSYNC within the grace period (%ld seconds). 
" "Freeing RDB client %llu.", - (long int)(server.unixtime - c->rdb_client_disconnect_time), (unsigned long long)c->id); + (long int)(server.unixtime - c->repl_data->rdb_client_disconnect_time), (unsigned long long)c->id); c->flag.protected_rdb_channel = 0; } @@ -2013,27 +1940,27 @@ void writeToReplica(client *c) { int nwritten = 0; serverAssert(c->bufpos == 0 && listLength(c->reply) == 0); while (clientHasPendingReplies(c)) { - replBufBlock *o = listNodeValue(c->ref_repl_buf_node); - serverAssert(o->used >= c->ref_block_pos); + replBufBlock *o = listNodeValue(c->repl_data->ref_repl_buf_node); + serverAssert(o->used >= c->repl_data->ref_block_pos); /* Send current block if it is not fully sent. */ - if (o->used > c->ref_block_pos) { - nwritten = connWrite(c->conn, o->buf + c->ref_block_pos, o->used - c->ref_block_pos); + if (o->used > c->repl_data->ref_block_pos) { + nwritten = connWrite(c->conn, o->buf + c->repl_data->ref_block_pos, o->used - c->repl_data->ref_block_pos); if (nwritten <= 0) { c->write_flags |= WRITE_FLAGS_WRITE_ERROR; return; } c->nwritten += nwritten; - c->ref_block_pos += nwritten; + c->repl_data->ref_block_pos += nwritten; } /* If we fully sent the object on head, go to the next one. */ - listNode *next = listNextNode(c->ref_repl_buf_node); - if (next && c->ref_block_pos == o->used) { + listNode *next = listNextNode(c->repl_data->ref_repl_buf_node); + if (next && c->repl_data->ref_block_pos == o->used) { o->refcount--; ((replBufBlock *)(listNodeValue(next)))->refcount++; - c->ref_repl_buf_node = next; - c->ref_block_pos = 0; + c->repl_data->ref_repl_buf_node = next; + c->repl_data->ref_block_pos = 0; incrementalTrimReplicationBacklog(REPL_BACKLOG_TRIM_BLOCKS_PER_CALL); } } @@ -2230,6 +2157,8 @@ int postWriteToClient(client *c) { server.stat_total_writes_processed++; if (getClientType(c) != CLIENT_TYPE_REPLICA) { _postWriteToClient(c); + } else { + server.stat_net_repl_output_bytes += c->nwritten > 0 ? 
c->nwritten : 0; } if (c->write_flags & WRITE_FLAGS_WRITE_ERROR) { @@ -2334,7 +2263,7 @@ int handleReadResult(client *c) { c->last_interaction = server.unixtime; c->net_input_bytes += c->nread; if (c->flag.primary) { - c->read_reploff += c->nread; + c->repl_data->read_reploff += c->nread; server.stat_net_repl_input_bytes += c->nread; } else { server.stat_net_input_bytes += c->nread; @@ -2405,7 +2334,7 @@ parseResult handleParseResults(client *c) { } if (c->read_flags & READ_FLAGS_INLINE_ZERO_QUERY_LEN && getClientType(c) == CLIENT_TYPE_REPLICA) { - c->repl_ack_time = server.unixtime; + c->repl_data->repl_ack_time = server.unixtime; } if (c->read_flags & READ_FLAGS_INLINE_ZERO_QUERY_LEN) { @@ -2548,6 +2477,7 @@ int handleClientsWithPendingWrites(void) { /* resetClient prepare the client to process the next command */ void resetClient(client *c) { serverCommandProc *prevcmd = c->cmd ? c->cmd->proc : NULL; + serverCommandProc *prevParentCmd = c->cmd && c->cmd->parent ? c->cmd->parent->proc : NULL; freeClientArgv(c); freeClientOriginalArgv(c); @@ -2577,7 +2507,7 @@ void resetClient(client *c) { /* We do the same for the CACHING command as well. It also affects * the next command or transaction executed, in a way very similar * to ASKING. */ - if (!c->flag.multi && prevcmd != clientCommand) c->flag.tracking_caching = 0; + if (!c->flag.multi && prevParentCmd != clientCommand) c->flag.tracking_caching = 0; /* Remove the CLIENT_REPLY_SKIP flag if any so that the reply * to the next command will be sent, but set the flag if the command @@ -2589,6 +2519,16 @@ void resetClient(client *c) { } } +void resetClientIOState(client *c) { + c->nwritten = 0; + c->nread = 0; + c->io_read_state = c->io_write_state = CLIENT_IDLE; + c->io_parsed_cmd = NULL; + c->flag.pending_command = 0; + c->io_last_bufpos = 0; + c->io_last_reply_block = NULL; +} + /* Initializes the shared query buffer to a new sds with the default capacity. 
* Need to ensure the initlen is not less than readlen in readToQueryBuf. */ void initSharedQueryBuf(void) { @@ -2979,10 +2919,12 @@ void commandProcessed(client *c) { clusterSlotStatsAddNetworkBytesInForUserClient(c); resetClient(c); - long long prev_offset = c->reploff; + if (!c->repl_data) return; + + long long prev_offset = c->repl_data->reploff; if (c->flag.primary && !c->flag.multi) { /* Update the applied replication offset of our primary. */ - c->reploff = c->read_reploff - sdslen(c->querybuf) + c->qb_pos; + c->repl_data->reploff = c->repl_data->read_reploff - sdslen(c->querybuf) + c->qb_pos; } /* If the client is a primary we need to compute the difference @@ -2992,10 +2934,10 @@ void commandProcessed(client *c) { * part of the replication stream, will be propagated to the * sub-replicas and to the replication backlog. */ if (c->flag.primary) { - long long applied = c->reploff - prev_offset; + long long applied = c->repl_data->reploff - prev_offset; if (applied) { - replicationFeedStreamFromPrimaryStream(c->querybuf + c->repl_applied, applied); - c->repl_applied += applied; + replicationFeedStreamFromPrimaryStream(c->querybuf + c->repl_data->repl_applied, applied); + c->repl_data->repl_applied += applied; } } } @@ -3227,7 +3169,7 @@ void readToQueryBuf(client *c) { * so they are also considered a part of the query buffer in a broader sense. * * For unauthenticated clients, the query buffer cannot exceed 1MB at most. */ - size_t qb_memory = sdslen(c->querybuf) + c->mstate.argv_len_sums; + size_t qb_memory = sdslen(c->querybuf) + (c->mstate ? 
c->mstate->argv_len_sums : 0); if (qb_memory > server.client_max_querybuf_len || (qb_memory > 1024 * 1024 && (c->read_flags & READ_FLAGS_AUTH_REQUIRED))) { c->read_flags |= READ_FLAGS_QB_LIMIT_REACHED; @@ -3355,9 +3297,9 @@ sds catClientInfoString(sds s, client *client, int hide_user_data) { size_t obufmem, total_mem = getClientMemoryUsage(client, &obufmem); size_t used_blocks_of_repl_buf = 0; - if (client->ref_repl_buf_node) { + if (client->repl_data && client->repl_data->ref_repl_buf_node) { replBufBlock *last = listNodeValue(listLast(server.repl_buffer_blocks)); - replBufBlock *cur = listNodeValue(client->ref_repl_buf_node); + replBufBlock *cur = listNodeValue(client->repl_data->ref_repl_buf_node); used_blocks_of_repl_buf = last->id - cur->id + 1; } sds ret = sdscatfmt( @@ -3372,15 +3314,15 @@ sds catClientInfoString(sds s, client *client, int hide_user_data) { " idle=%I", (long long)(server.unixtime - client->last_interaction), " flags=%s", flags, " db=%i", client->db->id, - " sub=%i", (int)dictSize(client->pubsub_channels), - " psub=%i", (int)dictSize(client->pubsub_patterns), - " ssub=%i", (int)dictSize(client->pubsubshard_channels), - " multi=%i", (client->flag.multi) ? client->mstate.count : -1, - " watch=%i", (int)listLength(client->watched_keys), + " sub=%i", client->pubsub_data ? (int)dictSize(client->pubsub_data->pubsub_channels) : 0, + " psub=%i", client->pubsub_data ? (int)dictSize(client->pubsub_data->pubsub_patterns) : 0, + " ssub=%i", client->pubsub_data ? (int)dictSize(client->pubsub_data->pubsubshard_channels) : 0, + " multi=%i", client->mstate ? client->mstate->count : -1, + " watch=%i", client->mstate ? (int)listLength(&client->mstate->watched_keys) : 0, " qbuf=%U", client->querybuf ? (unsigned long long)sdslen(client->querybuf) : 0, " qbuf-free=%U", client->querybuf ? 
(unsigned long long)sdsavail(client->querybuf) : 0, " argv-mem=%U", (unsigned long long)client->argv_len_sum, - " multi-mem=%U", (unsigned long long)client->mstate.argv_len_sums, + " multi-mem=%U", client->mstate ? (unsigned long long)client->mstate->argv_len_sums : 0, " rbs=%U", (unsigned long long)client->buf_usable_size, " rbp=%U", (unsigned long long)client->buf_peak, " obl=%U", (unsigned long long)client->bufpos, @@ -3390,7 +3332,7 @@ sds catClientInfoString(sds s, client *client, int hide_user_data) { " events=%s", events, " cmd=%s", client->lastcmd ? client->lastcmd->fullname : "NULL", " user=%s", hide_user_data ? "*redacted*" : (client->user ? client->user->name : "(superuser)"), - " redir=%I", (client->flag.tracking) ? (long long)client->client_tracking_redirection : -1, + " redir=%I", (client->flag.tracking) ? (long long)client->pubsub_data->client_tracking_redirection : -1, " resp=%i", client->resp, " lib-name=%s", client->lib_name ? (char *)client->lib_name->ptr : "", " lib-ver=%s", client->lib_ver ? (char *)client->lib_ver->ptr : "", @@ -3439,6 +3381,22 @@ sds getAllClientsInfoString(int type, int hide_user_data) { return o; } +static sds getAllFilteredClientsInfoString(clientFilter *client_filter, int hide_user_data) { + listNode *ln; + listIter li; + client *client; + sds o = sdsempty(); + sdsclear(o); + listRewind(server.clients, &li); + while ((ln = listNext(&li)) != NULL) { + client = listNodeValue(ln); + if (!clientMatchesFilter(client, *client_filter)) continue; + o = catClientInfoString(o, client, hide_user_data); + o = sdscatlen(o, "\n", 1); + } + return o; +} + /* Check validity of an attribute that's gonna be shown in CLIENT LIST. */ int validateClientAttr(const char *val) { /* Check if the charset is ok. 
We need to do this otherwise @@ -3558,569 +3516,648 @@ void quitCommand(client *c) { c->flag.close_after_reply = 1; } -void clientCommand(client *c) { - listNode *ln; - listIter li; +static int parseClientFiltersOrReply(client *c, int index, clientFilter *filter) { + while (index < c->argc) { + int moreargs = c->argc > index + 1; - if (c->argc == 2 && !strcasecmp(c->argv[1]->ptr, "help")) { - const char *help[] = { - "CACHING (YES|NO)", - " Enable/disable tracking of the keys for next command in OPTIN/OPTOUT modes.", - "CAPA