From 2e31f0b912be65c30a326bd2f9c6465d806678f5 Mon Sep 17 00:00:00 2001 From: Henrique de Carvalho Date: Tue, 24 Sep 2024 18:47:12 -0300 Subject: [PATCH] WIP: Add new custom event loop for I/O layer Introduce ev.h and ev.c, establishing the foundation for the new custom event loop, `pgagroal_ev`. Replace previous dependencies on libev with the custom event loop. For Linux, implement support for io_uring with a fallback to epoll if io_uring is unavailable. For BSD, implement support for kqueue. Changelog ========= 2025-01-17: - Fix regression added to the Debug build. `waitpid()` inside `#if DEBUG` would block waiting for the running child processes. Add WNOHANG to the function options so the main process does not block. - Remove '\n' from `pgagroal_log_*()` - Run uncrustify.sh - Remove `printf()` debugging - Modify CI/CD pipeline to improve function verify_running and enable the CI/CD for kqueue, which was disabled. - Change `pgagroal_log_error()` that informs errors in `kill()` to `pgagroal_log_debug()`. I suppose these errors should happen if pgagroal is not closely following up the death of its children. Therefore, there is no need to flood the log with error messages. Keep as debug because it is informative of something that could be improved. 2025-01-16: - Remove `pgagroal_ev_*_stop()` functions from main.c and vault.c as these functions are not intended to be called from a child process inheriting the ev loop. This does not make sense in io_uring and for epoll and kqueue, just closing the fds is enough as a precaution to prevent their use. - Enhance debugging steps in `sigchld_handler()` to help debug child processes termination. - Add error handling to `kill()` to help debug child processes termination. - Allow return error on `pgagroal_ev_io_stop()` instead of exit so the caller function can handle errors appropriately.
- Add a call to `setpgid()` when worker is created to ensure child processes do not exit immediately if a SIGINT is issued in the controlling terminal. - Add a call to `pgagroal_ev_fork()` when a worker is created. 2025-01-13: - Fix ASan report in `pgagroal_ev_io_stop()` --- .github/ISSUE_TEMPLATE/bug_report.md | 4 +- .github/config/pg_hba.conf | 4 + .github/workflows/ci.yml | 245 ++- CMakeLists.txt | 19 +- CONTRIBUTING.md | 2 +- README.md | 16 +- cmake/FindLibev.cmake | 38 - cmake/FindLiburing.cmake | 18 + doc/ARCHITECTURE.md | 18 +- doc/CONFIGURATION.md | 2 +- doc/DEVELOPERS.md | 2 +- doc/etc/pgagroal.conf | 1 + doc/man/pgagroal.conf.5.rst | 4 +- doc/manual/02-installation.md | 6 +- doc/manual/99-references.md | 2 +- doc/manual/dev-02-architecture.md | 13 +- doc/manual/user-02-configuration.md | 2 +- pgagroal.spec | 4 +- src/CMakeLists.txt | 17 +- src/bidi.c | 2172 ++++++++++++++++++++++++ src/include/configuration.h | 2 +- src/include/ev.h | 413 +++++ src/include/message.h | 9 + src/include/network.h | 1 + src/include/pgagroal.h | 3 +- src/include/pipeline.h | 2 +- src/include/utils.h | 25 +- src/include/worker.h | 2 +- src/libpgagroal/configuration.c | 204 ++- src/libpgagroal/ev.c | 1725 +++++++++++++++++++ src/libpgagroal/message.c | 29 + src/libpgagroal/network.c | 2 + src/libpgagroal/pipeline_perf.c | 57 +- src/libpgagroal/pipeline_session.c | 58 +- src/libpgagroal/pipeline_transaction.c | 80 +- src/libpgagroal/security.c | 1 + src/libpgagroal/utils.c | 163 +- src/libpgagroal/worker.c | 51 +- src/main.c | 168 +- src/vault.c | 50 +- 40 files changed, 5033 insertions(+), 601 deletions(-) create mode 100644 .github/config/pg_hba.conf delete mode 100644 cmake/FindLibev.cmake create mode 100644 cmake/FindLiburing.cmake create mode 100644 src/bidi.c create mode 100644 src/include/ev.h create mode 100644 src/libpgagroal/ev.c diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 10f6ea02..e2240b05 100644 --- 
a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -23,9 +23,9 @@ What is the version of pgagroal ? What is the version of PostgreSQL ? -**libev** +**liburing** -What is the version of libev ? +What is the version of liburing ? **OpenSSL** diff --git a/.github/config/pg_hba.conf b/.github/config/pg_hba.conf new file mode 100644 index 00000000..21ad7f6a --- /dev/null +++ b/.github/config/pg_hba.conf @@ -0,0 +1,4 @@ +local all all trust +host all all all trust +local replication all peer +host replication all all trust diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a1078612..e97e7880 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -11,7 +11,7 @@ on: jobs: build-linux: - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v3 @@ -21,8 +21,8 @@ jobs: sudo wget --quiet --output-document /etc/apt/trusted.gpg.d/apt.postgresql.org.asc https://www.postgresql.org/media/keys/ACCC4CF8.asc - name: Update system run: sudo apt update - - name: Install libev - run: sudo apt install -y libev4 libev-dev + - name: Install liburing + run: sudo apt install -y liburing-dev - name: Install systemd run: sudo apt install -y libsystemd-dev - name: Install rst2man @@ -37,14 +37,43 @@ jobs: run: sudo apt install graphviz - name: Install doxygen run: sudo apt install doxygen + - name: Install crudini + run: sudo apt install -y crudini - name: Install clang run: sudo apt install -y clang - name: Install PostgreSQL - run: sudo apt install -y postgresql - - name: Start postgres + run: sudo apt install -y postgresql postgresql-contrib + - name: Start postgres & setup test table run: | version=$(pg_config --version | grep -Eo "[0-9]{1,2}" | head -1) - sudo -u postgres /usr/lib/postgresql/${version}/bin/pg_ctl start -D /etc/postgresql/${version}/main/ + sudo cp pg_hba.conf /etc/postgresql/${version}/main + sudo -u postgres /usr/lib/postgresql/${version}/bin/pg_ctl start -D 
/etc/postgresql/${version}/main/ -o "-p 5432" + netstat -tuln | grep '127.0.0.1:5432' || (echo "Nothing is listening on 127.0.0.1:5432"; exit 1) + netstat -tuln | grep '::1:5432' || (echo "Nothing is listening on ::1:5432"; exit 1) + PGPASSWORD="postgres" pgbench -i -s 1 -h localhost -p 5432 -U postgres -d postgres + working-directory: /home/runner/work/pgagroal/pgagroal/.github/config/ + - name: Define functions `verify_running` and `verify_shutdown` + run: | + echo 'verify_running() { + echo "Confirming pgagroal is listening on port 2345" + netstat -tuln | grep "127.0.0.1:2345" || (echo "Nothing is listening on 127.0.0.1:2345"; exit 1) + netstat -tuln | grep "::1:2345" || (echo "Nothing is listening on ::1:2345"; exit 1) + echo "[*] Running pgagroal-cli ping" + ./pgagroal-cli ping + echo "[*] Running queries with psql" + PGPASSWORD="postgres" psql -h 127.0.0.1 -p 2345 -U postgres -d postgres -c "SELECT * FROM pgbench_accounts LIMIT 50;" > /dev/null + PGPASSWORD="postgres" psql -h ::1 -p 2345 -U postgres -d postgres -c "SELECT * FROM pgbench_accounts LIMIT 50;" > /dev/null + } + + verify_shutdown() { + echo "[*] Running pgagroal-cli shutdown immediate" + ./pgagroal-cli shutdown immediate + sleep 5 + echo "[*] Confirming there are no dangling pgagroal processes" + pgrep pgagroal > /dev/null && echo "[E] Dangling pgagroal child processes: $(wc -l < <(pgrep pgagroal))" && exit 1 + echo "rm -f /tmp/pgagroal.2345.pid" + rm -f /tmp/pgagroal.2345.pid + }' > /tmp/functions.sh - name: GCC/mkdir run: mkdir build working-directory: /home/runner/work/pgagroal/pgagroal/ @@ -54,28 +83,52 @@ jobs: - name: GCC/make run: make working-directory: /home/runner/work/pgagroal/pgagroal/build/ - - name: GCC/Run pgagroal & confirm pgagroal is running + - name: GCC/Run pgagroal as daemon with 'io_uring' backend run: | sudo mkdir -p /etc/pgagroal + crudini --set ../../doc/etc/pgagroal.conf pgagroal log_type file + crudini --set ../../doc/etc/pgagroal.conf pgagroal log_path /dev/null + 
crudini --set ../../doc/etc/pgagroal.conf pgagroal ev_backend io_uring + echo "host all all all trust" > ../../doc/etc/pgagroal_hba.conf sudo cp ../../doc/etc/*.conf /etc/pgagroal - ./pgagroal >> /dev/null 2>&1 & - pid=$! + sudo sysctl kernel.io_uring_disabled=0 + ./pgagroal -d || exit 1 sleep 5 - ./pgagroal-cli ping working-directory: /home/runner/work/pgagroal/pgagroal/build/src/ - - name: GCC/Stop pgagroal & postgres + - name: GCC/Run verify_running for 'io_uring' backend run: | - ./pgagroal-cli shutdown - version=$(pg_config --version | grep -Eo "[0-9]{1,2}" | head -1) - sudo -u postgres /usr/lib/postgresql/${version}/bin/pg_ctl stop -D /etc/postgresql/${version}/main/ + source /tmp/functions.sh + verify_running + working-directory: /home/runner/work/pgagroal/pgagroal/build/src/ + - name: GCC/Run verify_shutdown for 'io_uring' backend + run: | + source /tmp/functions.sh + verify_shutdown + working-directory: /home/runner/work/pgagroal/pgagroal/build/src/ + - name: GCC/Run pgagroal as daemon with 'epoll' backend + run: | + sudo mkdir -p /etc/pgagroal + crudini --set ../../doc/etc/pgagroal.conf pgagroal log_type file + crudini --set ../../doc/etc/pgagroal.conf pgagroal log_path /dev/null + crudini --set ../../doc/etc/pgagroal.conf pgagroal ev_backend epoll + echo "host all all all trust" > ../../doc/etc/pgagroal_hba.conf + sudo cp ../../doc/etc/*.conf /etc/pgagroal + ./pgagroal -d || exit + sleep 5 + working-directory: /home/runner/work/pgagroal/pgagroal/build/src/ + - name: GCC/Run verify_running for 'epoll' backend + run: | + source /tmp/functions.sh + verify_running + working-directory: /home/runner/work/pgagroal/pgagroal/build/src/ + - name: GCC/Run verify_shutdown for 'epoll' backend + run: | + source /tmp/functions.sh + verify_shutdown working-directory: /home/runner/work/pgagroal/pgagroal/build/src/ - name: rm -Rf run: rm -Rf build/ working-directory: /home/runner/work/pgagroal/pgagroal/ - - name: Start postgres - run: | - version=$(pg_config --version | 
grep -Eo "[0-9]{1,2}" | head -1) - sudo -u postgres /usr/lib/postgresql/${version}/bin/pg_ctl start -D /etc/postgresql/${version}/main/ - name: CLANG/mkdir run: mkdir build working-directory: /home/runner/work/pgagroal/pgagroal/ @@ -85,38 +138,60 @@ jobs: - name: CLANG/make run: make working-directory: /home/runner/work/pgagroal/pgagroal/build/ - - name: CLANG/Run pgagroal & confirm pgagroal is running + - name: CLANG/Run pgagroal as daemon with 'io_uring' backend run: | sudo mkdir -p /etc/pgagroal + crudini --set ../../doc/etc/pgagroal.conf pgagroal log_type file + crudini --set ../../doc/etc/pgagroal.conf pgagroal log_path /dev/null + crudini --set ../../doc/etc/pgagroal.conf pgagroal ev_backend io_uring + echo "host all all all trust" > ../../doc/etc/pgagroal_hba.conf sudo cp ../../doc/etc/*.conf /etc/pgagroal - ./pgagroal >> /dev/null 2>&1 & - pid=$! + sudo sysctl kernel.io_uring_disabled=0 + ./pgagroal -d || exit 1 sleep 5 - ./pgagroal-cli ping working-directory: /home/runner/work/pgagroal/pgagroal/build/src/ - - name: CLANG/Stop pgagroal & postgres + - name: CLANG/Run verify_running for 'io_uring' backend run: | - ./pgagroal-cli shutdown - version=$(pg_config --version | grep -Eo "[0-9]{1,2}" | head -1) - sudo -u postgres /usr/lib/postgresql/${version}/bin/pg_ctl stop -D /etc/postgresql/${version}/main/ + source /tmp/functions.sh + verify_running + working-directory: /home/runner/work/pgagroal/pgagroal/build/src/ + - name: CLANG/Run verify_shutdown for 'io_uring' backend + run: | + source /tmp/functions.sh + verify_shutdown + working-directory: /home/runner/work/pgagroal/pgagroal/build/src/ + - name: CLANG/Run pgagroal as daemon with 'epoll' backend + run: | + sudo mkdir -p /etc/pgagroal + crudini --set ../../doc/etc/pgagroal.conf pgagroal log_type file + crudini --set ../../doc/etc/pgagroal.conf pgagroal log_path /dev/null + crudini --set ../../doc/etc/pgagroal.conf pgagroal ev_backend epoll + echo "host all all all trust" > ../../doc/etc/pgagroal_hba.conf + 
sudo cp ../../doc/etc/*.conf /etc/pgagroal + ./pgagroal -d || exit 1 + sleep 5 + working-directory: /home/runner/work/pgagroal/pgagroal/build/src/ + - name: CLANG/Run verify_running for 'epoll' backend + run: | + source /tmp/functions.sh + verify_running + working-directory: /home/runner/work/pgagroal/pgagroal/build/src/ + - name: CLANG/Run verify_shutdown for 'epoll' backend + run: | + source /tmp/functions.sh + verify_shutdown working-directory: /home/runner/work/pgagroal/pgagroal/build/src/ - - build-macos: - + runs-on: macos-latest - + steps: - uses: actions/checkout@v3 - - name: Install Homebrew - run: /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install.sh)" - name: Update system run: brew update - name: Install openssl run: brew install openssl - - name: Install libev - run: brew install libev - name: Install zstd run: brew install zstd - name: Install lz4 @@ -135,10 +210,42 @@ jobs: run: | latest_pg=$(brew search postgresql | grep postgresql@ | tail -n 1) brew install ${latest_pg} || true # `|| true` prevents install errors from breaking the run - - name: Start postgres + - name: Start PostgreSQL & setup test table run: | installed_pg=$(brew search postgresql | grep postgresql@ | tail -n 1) brew services start ${installed_pg} + sleep 5 + /opt/homebrew/opt/${installed_pg}/bin/pgbench -i -s 1 -h localhost -p 5432 -U $(whoami) -d postgres + exit 0 + - name: Define functions `verify_running` and `verify_shutdown` + run: | + echo 'verify_running() { + echo "[*] Confirming pgagroal is listening on port 2345" + netstat -an | grep "\.2345 .*LISTEN" || (echo "Nothing is listening on port 2345"; exit 1) + echo "[*] Running pgagroal-cli ping" + ./pgagroal-cli ping + echo "[*] Running queries with psql" + installed_pg=$(brew search postgresql | grep postgresql@ | tail -n 1) + PGPASSWORD="postgres" /opt/homebrew/opt/${installed_pg}/bin/psql -h 127.0.0.1 -p 2345 -U $(whoami) -d postgres -c "SELECT * FROM pgbench_accounts LIMIT 50;" 
> /dev/null + PGPASSWORD="postgres" /opt/homebrew/opt/${installed_pg}/bin/psql -h ::1 -p 2345 -U $(whoami) -d postgres -c "SELECT * FROM pgbench_accounts LIMIT 50;" > /dev/null + } + + verify_shutdown() { + echo "[*] Getting pgid" + pgid=$(ps -o pgid= -p $(cat /tmp/pgagroal.2345.pid) | grep -o "[0-9]*") + echo "[*] Running pgagroal-cli shutdown immediate" + ./pgagroal-cli shutdown immediate + sleep 5 + echo "[*] Confirming there are no dangling pgagroal processes" + if pgrep -g $pgid > /tmp/dangling; then + echo "[E] Dangling pgagroal child processes: + $(cat /tmp/dangling)" + exit 1 + else + echo "Removing PID file" + rm -f /tmp/pgagroal.2345.pid + fi + }' > /tmp/functions.sh - name: GCC/mkdir run: mkdir build working-directory: /Users/runner/work/pgagroal/pgagroal/ @@ -148,49 +255,63 @@ jobs: - name: GCC/make run: make working-directory: /Users/runner/work/pgagroal/pgagroal/build/ - - name: GCC/Run pgagroal & confirm pgagroal is running - run: | - sudo mkdir -p /etc/pgagroal - sudo cp ../../doc/etc/*.conf /etc/pgagroal - ./pgagroal >> /dev/null 2>&1 & - pid=$! 
- sleep 5 - ./pgagroal-cli ping + - name: GCC/Run pgagroal as daemon with 'kqueue' backend + run: | + sudo mkdir -p /etc/pgagroal + sed -i '' 's/^log_type =.*$/log_type = file/' ../../doc/etc/pgagroal.conf + sed -i '' 's|^log_path =.*$|log_path = /dev/null|' ../../doc/etc/pgagroal.conf + sed -i '' 's/^ev_backend =.*$/ev_backend = kqueue/' ../../doc/etc/pgagroal.conf + cat ../../doc/etc/pgagroal.conf + echo "host all all all trust" > ../../doc/etc/pgagroal_hba.conf + sudo cp ../../doc/etc/*.conf /etc/pgagroal + ./pgagroal -d || exit 1 + sleep 5 working-directory: /Users/runner/work/pgagroal/pgagroal/build/src/ - - name: GCC/Stop pgagroal & postgres + - name: GCC/Run verify_running for 'kqueue' backend run: | - ./pgagroal-cli shutdown - installed_pg=$(brew search postgresql | grep postgresql@ | tail -n 1) - brew services stop ${installed_pg} + source /tmp/functions.sh + verify_running working-directory: /Users/runner/work/pgagroal/pgagroal/build/src/ - - name: rm -Rf - run: rm -Rf build/ - working-directory: /Users/runner/work/pgagroal/pgagroal/ - - name: Start postgres + - name: GCC/Run verify_shutdown for 'kqueue' backend run: | - installed_pg=$(brew search postgresql | grep postgresql@ | tail -n 1) - brew services start ${installed_pg} + source /tmp/functions.sh + verify_shutdown + working-directory: /Users/runner/work/pgagroal/pgagroal/build/src/ + - name: GCC/Clean up build + run: rm -Rf build + working-directory: /Users/runner/work/pgagroal/pgagroal/ - name: CLANG/mkdir run: mkdir build working-directory: /Users/runner/work/pgagroal/pgagroal/ - name: CLANG/cmake - run: export CC=/usr/bin/clang && export OPENSSL_ROOT_DIR=`brew --prefix openssl` && cmake -DCMAKE_BUILD_TYPE=Debug .. + run: | + export CC=/usr/bin/clang + export OPENSSL_ROOT_DIR=$(brew --prefix openssl) + cmake -DCMAKE_BUILD_TYPE=Debug .. 
working-directory: /Users/runner/work/pgagroal/pgagroal/build/ - name: CLANG/make run: make working-directory: /Users/runner/work/pgagroal/pgagroal/build/ - - name: CLANG/Run pgagroal & confirm pgagroal is running + - name: CLANG/Run pgagroal as daemon with 'kqueue' backend run: | sudo mkdir -p /etc/pgagroal + sed -i '' 's/^log_type =.*$/log_type = file/' ../../doc/etc/pgagroal.conf + sed -i '' 's|^log_path =.*$|log_path = /dev/null|' ../../doc/etc/pgagroal.conf + sed -i '' 's/^ev_backend =.*$/ev_backend = kqueue/' ../../doc/etc/pgagroal.conf + cat ../../doc/etc/pgagroal.conf + echo "host all all all trust" > ../../doc/etc/pgagroal_hba.conf sudo cp ../../doc/etc/*.conf /etc/pgagroal - ./pgagroal >> /dev/null 2>&1 & - pid=$! + ./pgagroal -d || exit 1 sleep 5 - ./pgagroal-cli ping working-directory: /Users/runner/work/pgagroal/pgagroal/build/src/ - - name: CLANG/Stop pgagroal & postgres + - name: CLANG/Run verify_running for 'kqueue' backend run: | - ./pgagroal-cli shutdown - installed_pg=$(brew search postgresql | grep postgresql@ | tail -n 1) - brew services stop ${installed_pg} + source /tmp/functions.sh + verify_running working-directory: /Users/runner/work/pgagroal/pgagroal/build/src/ + - name: CLANG/Run verify_shutdown for 'kqueue' backend + run: | + source /tmp/functions.sh + verify_shutdown + working-directory: /Users/runner/work/pgagroal/pgagroal/build/src/ + diff --git a/CMakeLists.txt b/CMakeLists.txt index 7f7eb03e..4c58279a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,6 +37,7 @@ set(generation TRUE) include(CheckCCompilerFlag) include(CheckCSourceCompiles) include(CheckLinkerFlag) +include(CheckFunctionExists) include(FindPackageHandleStandardArgs) include(GNUInstallDirs) @@ -51,7 +52,7 @@ set(SUPPORTED_COMPILERS "GNU" "Clang" "AppleClang") # Check for a supported compiler if (NOT CMAKE_C_COMPILER_ID IN_LIST SUPPORTED_COMPILERS) - message(FATAL_ERROR "Unsupported compiler ${CMAKE_C_COMPILER_ID}. 
Supported compilers are: ${SUPPORTED_COMPILERS}") + message(FATAL_ERROR "Unsupported compiler ${CMAKE_C_COMPILER_ID}. Supported compilers are: ${SUPPORTED_COMPILERS}") endif () CHECK_C_COMPILER_FLAG("-std=c17" COMPILER_SUPPORTS_C17) @@ -87,13 +88,6 @@ else () message(FATAL_ERROR "lz4 needed") endif() -find_package(Libev 4.11) -if (LIBEV_FOUND) - message(STATUS "libev found") -else () - message(FATAL_ERROR "libev needed") -endif() - find_package(OpenSSL) if (OPENSSL_FOUND) message(STATUS "OpenSSL found") @@ -146,12 +140,21 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux") message(FATAL_ERROR "libatomic needed") endif() + find_package(Liburing 2.5) + if (LIBURING_FOUND) + message(STATUS "liburing found") + else() + message(FATAL_ERROR "liburing needed") + endif() + find_package(Systemd) if (SYSTEMD_FOUND) message(STATUS "systemd found") else () message(FATAL_ERROR "systemd needed") endif() + + check_function_exists(epoll_pwait2 HAVE_EPOLL_PWAIT2) endif() file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/src/") diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 22e74785..9899b82b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -34,7 +34,7 @@ Don't forget to indicate your pgagroal version. You can use the follow command, if you are using a [Fedora](https://getfedora.org/) based platform: ``` -dnf install git gcc cmake make libev libev-devel openssl openssl-devel systemd systemd-devel python3-docutils +dnf install git gcc cmake make openssl openssl-devel systemd systemd-devel python3-docutils liburing liburing-devel ``` in order to get the necessary dependencies. diff --git a/README.md b/README.md index 68661b96..16c5cd99 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ See [Performance](./doc/PERFORMANCE.md) for a performance run. 
* Process model * Shared memory model across processes -* [libev](http://software.schmorp.de/pkg/libev.html) for fast network interactions +* [liburing](https://github.com/axboe/liburing) for fast network interactions * [Atomic operations](https://en.cppreference.com/w/c/atomic) are used to keep track of state * The [PostgreSQL](https://www.postgresql.org) native protocol [v3](https://www.postgresql.org/docs/11/protocol-message-formats.html) for its communication @@ -63,7 +63,7 @@ after having installed all the required dependencies: * [gcc 8+](https://gcc.gnu.org) (C17) or [clang 8+](https://clang.llvm.org/) * [cmake](https://cmake.org) * [GNU make](https://www.gnu.org/software/make/) or BSD `make` -* [libev](http://software.schmorp.de/pkg/libev.html) +* [liburing](https://github.com/axboe/liburing) * [OpenSSL](http://www.openssl.org/) * [rst2man](https://docutils.sourceforge.io/) * [libatomic](https://gcc.gnu.org/wiki/Atomic) @@ -77,6 +77,18 @@ after having installed all the required dependencies: See the [documentation about installing the required dependencies](doc/DISTRIBUTIONS.md). 
+```sh +dnf install git gcc cmake make \ + liburing liburing-devel \ + openssl openssl-devel \ + systemd systemd-devel \ + python3-docutils \ + libatomic \ + zlib zlib-devel \ + libzstd libzstd-devel \ + lz4 lz4-devel \ + bzip2 bzip2-devel +``` ### Release build diff --git a/cmake/FindLibev.cmake b/cmake/FindLibev.cmake deleted file mode 100644 index 71e45082..00000000 --- a/cmake/FindLibev.cmake +++ /dev/null @@ -1,38 +0,0 @@ -# - Try to find libev -# Once done this will define -# LIBEV_FOUND - System has libev -# LIBEV_INCLUDE_DIRS - The libev include directories -# LIBEV_LIBRARIES - The libraries needed to use libev - -find_path(LIBEV_INCLUDE_DIR - NAMES ev.h -) -find_library(LIBEV_LIBRARY - NAMES ev -) - -if(LIBEV_INCLUDE_DIR) - file(STRINGS "${LIBEV_INCLUDE_DIR}/ev.h" - LIBEV_VERSION_MAJOR REGEX "^#define[ \t]+EV_VERSION_MAJOR[ \t]+[0-9]+") - file(STRINGS "${LIBEV_INCLUDE_DIR}/ev.h" - LIBEV_VERSION_MINOR REGEX "^#define[ \t]+EV_VERSION_MINOR[ \t]+[0-9]+") - string(REGEX REPLACE "[^0-9]+" "" LIBEV_VERSION_MAJOR "${LIBEV_VERSION_MAJOR}") - string(REGEX REPLACE "[^0-9]+" "" LIBEV_VERSION_MINOR "${LIBEV_VERSION_MINOR}") - set(LIBEV_VERSION "${LIBEV_VERSION_MAJOR}.${LIBEV_VERSION_MINOR}") - unset(LIBEV_VERSION_MINOR) - unset(LIBEV_VERSION_MAJOR) -endif() - -include(FindPackageHandleStandardArgs) -# handle the QUIETLY and REQUIRED arguments and set LIBEV_FOUND to TRUE -# if all listed variables are TRUE and the requested version matches. 
-find_package_handle_standard_args(Libev REQUIRED_VARS - LIBEV_LIBRARY LIBEV_INCLUDE_DIR - VERSION_VAR LIBEV_VERSION) - -if(LIBEV_FOUND) - set(LIBEV_LIBRARIES ${LIBEV_LIBRARY}) - set(LIBEV_INCLUDE_DIRS ${LIBEV_INCLUDE_DIR}) -endif() - -mark_as_advanced(LIBEV_INCLUDE_DIR LIBEV_LIBRARY) diff --git a/cmake/FindLiburing.cmake b/cmake/FindLiburing.cmake new file mode 100644 index 00000000..7bbf3b13 --- /dev/null +++ b/cmake/FindLiburing.cmake @@ -0,0 +1,18 @@ +# - Try to find liburing +# Once done this will define +# LIBURING_FOUND - System has liburing +# LIBURING_LIBRARY - The library needed to use liburing + +FIND_LIBRARY(LIBURING_LIBRARY NAMES liburing liburing.a liburing.so liburing.so.2 + HINTS + /usr/lib64 + /usr/lib + /lib64 + /lib +) + +IF (LIBURING_LIBRARY) + SET(LIBURING_FOUND TRUE) +ELSE () + SET(LIBURING_FOUND FALSE) +ENDIF () diff --git a/doc/ARCHITECTURE.md b/doc/ARCHITECTURE.md index e1a8aee7..6f21bf1d 100644 --- a/doc/ARCHITECTURE.md +++ b/doc/ARCHITECTURE.md @@ -125,10 +125,18 @@ AuthenticationSASLFinal and AuthenticationOk. The SSLRequest message is supporte The remote management interface is defined in [remote.h](../src/include/remote.h) ([remote.c](../src/libpgagroal/remote.c)). -## libev usage +## I/O layer -[libev](http://software.schmorp.de/pkg/libev.html) is used to handle network interactions, which is "activated" -upon an `EV_READ` event. +The I/O layer interface is primarily defined in [ev.h](../src/include/ev.h) (and implemented in [ev.c](../src/libpgagroal/ev.c)). + +These files contain the definition and implementation of the event loop for the three supported backends: +io_uring, epoll, and kqueue. + +The backend is defined during runtime and can be set with the configuration option `ev_backend`. +Default is `auto`, which will select the first supported backend, considering the following order: +io_uring, epoll, kqueue. + +[liburing](https://github.com/axboe/liburing) is used to set up and use io_uring instances.
Each process has its own event loop, such that the process only gets notified when data related only to that process is ready. The main loop handles the system wide "services" such as idle timeout checks and so on. @@ -172,7 +180,7 @@ The functions `start`, `client`, `server` and `stop` has access to the following ```C struct worker_io { - struct ev_io io; /* The libev base type */ + struct ev_io io; /* The base type for io operations */ int client_fd; /* The client descriptor */ int server_fd; /* The server descriptor */ int slot; /* The slot */ @@ -260,7 +268,7 @@ The `SIGHUP` signal will trigger a reload of the configuration. However, some configuration settings requires a full restart of [**pgagroal**](https://github.com/agroal/pgagroal) in order to take effect. These are * `hugepage` -* `libev` +* `ev_backend` * `log_path` * `log_type` * `max_connections` diff --git a/doc/CONFIGURATION.md b/doc/CONFIGURATION.md index d9c0f262..4ba19994 100644 --- a/doc/CONFIGURATION.md +++ b/doc/CONFIGURATION.md @@ -78,7 +78,7 @@ The available keys and their accepted values are reported in the table below. | tls_cert_file | | String | No | Certificate file for TLS. This file must be owned by either the user running pgagroal or root. | | tls_key_file | | String | No | Private key file for TLS. This file must be owned by either the user running pgagroal or root. Additionally permissions must be at least `0640` when owned by root or `0600` otherwise. | | tls_ca_file | | String | No | Certificate Authority (CA) file for TLS. This file must be owned by either the user running pgagroal or root. | -| libev | `auto` | String | No | Select the [libev](http://software.schmorp.de/pkg/libev.html) backend to use. 
Valid options: `auto`, `select`, `poll`, `epoll`, `iouring`, `devpoll` and `port` | +| ev_backend | `auto` | String | No | Select the event handling backend to use (`auto`, `io_uring`, `epoll`, and `kqueue`) | | keep_alive | on | Bool | No | Have `SO_KEEPALIVE` on sockets | | nodelay | on | Bool | No | Have `TCP_NODELAY` on sockets | | non_blocking | off | Bool | No | Have `O_NONBLOCK` on sockets | diff --git a/doc/DEVELOPERS.md b/doc/DEVELOPERS.md index 2a4e18a1..10e5c4c6 100644 --- a/doc/DEVELOPERS.md +++ b/doc/DEVELOPERS.md @@ -17,7 +17,7 @@ dnf install postgresql-server #### Basic dependencies ``` sh -dnf install git gcc cmake make libev libev-devel openssl openssl-devel systemd systemd-devel python3-docutils libatomic zlib zlib-devel libzstd libzstd-devel lz4 lz4-devel bzip2 bzip2-devel +dnf install git gcc cmake make liburing liburing-devel openssl openssl-devel systemd systemd-devel python3-docutils libatomic zlib zlib-devel libzstd libzstd-devel lz4 lz4-devel bzip2 bzip2-devel ``` #### Generate user and developer guide diff --git a/doc/etc/pgagroal.conf b/doc/etc/pgagroal.conf index caa84237..c6226fba 100644 --- a/doc/etc/pgagroal.conf +++ b/doc/etc/pgagroal.conf @@ -10,6 +10,7 @@ max_connections = 100 idle_timeout = 600 validation = off unix_socket_dir = /tmp/ +ev_backend = auto [primary] host = localhost diff --git a/doc/man/pgagroal.conf.5.rst b/doc/man/pgagroal.conf.5.rst index 2a24bbda..798b4ba4 100644 --- a/doc/man/pgagroal.conf.5.rst +++ b/doc/man/pgagroal.conf.5.rst @@ -146,8 +146,8 @@ tls_key_file tls_ca_file Certificate Authority (CA) file for TLS. Changes require restart in the server section. -libev - The libev backend to use. Valid options: auto, select, poll, epoll, iouring, devpoll and port. Default is auto +ev_backend + The event handling backend to use. Valid options are auto, io_uring, epoll, and kqueue. Default is auto keep_alive Have SO_KEEPALIVE on sockets. 
Default is on diff --git a/doc/manual/02-installation.md b/doc/manual/02-installation.md index 99c15c2d..00382862 100644 --- a/doc/manual/02-installation.md +++ b/doc/manual/02-installation.md @@ -41,7 +41,7 @@ We recommend using Fedora to test and run [**pgagroal**][pgagroal], but other Li * [gcc 8+](https://gcc.gnu.org) (C17) * [cmake](https://cmake.org) * [make](https://www.gnu.org/software/make/) -* [libev](http://software.schmorp.de/pkg/libev.html) +* [liburing](https://github.com/axboe/liburing) * [OpenSSL](http://www.openssl.org/) * [systemd](https://www.freedesktop.org/wiki/Software/systemd/) * [rst2man](https://docutils.sourceforge.io/) @@ -52,7 +52,7 @@ We recommend using Fedora to test and run [**pgagroal**][pgagroal], but other Li * [bzip2](http://sourceware.org/bzip2/) ```sh -dnf install git gcc cmake make libev libev-devel \ +dnf install git gcc cmake make liburing liburing-devel \ openssl openssl-devel \ systemd systemd-devel \ python3-docutils libatomic \ @@ -107,7 +107,7 @@ On FreeBSD, `pkg` is used instead of `dnf` or `yum`. 
Use `pkg install ` to install the following packages ``` sh -git gcc cmake libev openssl libssh py39-docutils +git gcc cmake openssl libssh py39-docutils ``` ### Build diff --git a/doc/manual/99-references.md b/doc/manual/99-references.md index 69154256..f81c0096 100644 --- a/doc/manual/99-references.md +++ b/doc/manual/99-references.md @@ -10,7 +10,7 @@ [gcc]: https://gcc.gnu.org [cmake]: https://cmake.org [make]: https://www.gnu.org/software/make/ - [libev]: http://software.schmorp.de/pkg/libev.html + [liburing]: https://github.com/axboe/liburing [openssl]: http://www.openssl.org/ [systemd]: https://www.freedesktop.org/wiki/Software/systemd/ [rst2man]: https://docutils.sourceforge.io/ diff --git a/doc/manual/dev-02-architecture.md b/doc/manual/dev-02-architecture.md index 5c672111..851801ce 100644 --- a/doc/manual/dev-02-architecture.md +++ b/doc/manual/dev-02-architecture.md @@ -127,10 +127,14 @@ AuthenticationSASLFinal and AuthenticationOk. The SSLRequest message is supporte The remote management interface is defined in [remote.h](../src/include/remote.h) ([remote.c](../src/libpgagroal/remote.c)). -## libev usage +## I/O Layer -[libev](http://software.schmorp.de/pkg/libev.html) is used to handle network interactions, which is "activated" -upon an `EV_READ` event. +The I/O layer interface is primarily defined in [ev.h](../src/include/ev.h) (and implemented in [ev.c](../src/libpgagroal/ev.c)). + +These files contain the definition and implementation of the event loop for the three supported backends: +`io_uring`, `epoll`, and `kqueue`. + +[liburing](https://github.com/axboe/liburing) is used to set up and use io_uring instances. Each process has its own event loop, such that the process only gets notified when data related only to that process is ready. The main loop handles the system wide "services" such as idle timeout checks and so on.
@@ -174,7 +178,7 @@ The functions `start`, `client`, `server` and `stop` has access to the following ```C struct worker_io { - struct ev_io io; /* The libev base type */ + struct ev_io io; /* The base type for io operations */ int client_fd; /* The client descriptor */ int server_fd; /* The server descriptor */ int slot; /* The slot */ @@ -262,7 +266,6 @@ The `SIGHUP` signal will trigger a reload of the configuration. However, some configuration settings requires a full restart of [**pgagroal**](https://github.com/agroal/pgagroal) in order to take effect. These are * `hugepage` -* `libev` * `log_path` * `log_type` * `max_connections` diff --git a/doc/manual/user-02-configuration.md b/doc/manual/user-02-configuration.md index ae2675dd..bdad4c5d 100644 --- a/doc/manual/user-02-configuration.md +++ b/doc/manual/user-02-configuration.md @@ -60,7 +60,7 @@ The available keys and their accepted values are reported in the table below. | tls_cert_file | | String | No | Certificate file for TLS. This file must be owned by either the user running pgagroal or root. | | tls_key_file | | String | No | Private key file for TLS. This file must be owned by either the user running pgagroal or root. Additionally permissions must be at least `0640` when owned by root or `0600` otherwise. | | tls_ca_file | | String | No | Certificate Authority (CA) file for TLS. This file must be owned by either the user running pgagroal or root. | -| libev | `auto` | String | No | Select the [libev](http://software.schmorp.de/pkg/libev.html) backend to use. 
Valid options: `auto`, `select`, `poll`, `epoll`, `iouring`, `devpoll` and `port` | +| ev_backend | `auto` | String | No | Select the event handling backend to use (`auto`, `io_uring`, `epoll`, and `kqueue`) | | keep_alive | on | Bool | No | Have `SO_KEEPALIVE` on sockets | | nodelay | on | Bool | No | Have `TCP_NODELAY` on sockets | | non_blocking | off | Bool | No | Have `O_NONBLOCK` on sockets | diff --git a/pgagroal.spec b/pgagroal.spec index b393a36f..2e207bdc 100644 --- a/pgagroal.spec +++ b/pgagroal.spec @@ -7,8 +7,8 @@ URL: https://github.com/agroal/pgagroal Source0: https://github.com/agroal/pgagroal/archive/%{version}.tar.gz BuildRequires: gcc cmake make python3-docutils -BuildRequires: libev libev-devel openssl openssl-devel systemd systemd-devel libatomic zlib zlib-devel libzstd libzstd-devel lz4 lz4-devel bzip2 bzip2-devel -Requires: libev openssl systemd libatomic zlib libzstd lz4 bzip2 +BuildRequires: liburing liburing-devel openssl openssl-devel systemd systemd-devel libatomic zlib zlib-devel libzstd libzstd-devel lz4 lz4-devel bzip2 bzip2-devel +Requires: liburing openssl systemd libatomic zlib libzstd lz4 bzip2 %description pgagroal is a high-performance connection pool for PostgreSQL. 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index cdccba5d..ecf63434 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -19,7 +19,6 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux") # include_directories( ${CMAKE_CURRENT_SOURCE_DIR}/include - ${LIBEV_INCLUDE_DIRS} ${OPENSSL_INCLUDE_DIR} ${SYSTEMD_INCLUDE_DIRS} ${THREAD_INCLUDE_DIRS} @@ -33,7 +32,6 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux") # Library directories # link_libraries( - ${LIBEV_LIBRARIES} ${OPENSSL_CRYPTO_LIBRARY} ${OPENSSL_SSL_LIBRARY} ${SYSTEMD_LIBRARIES} @@ -45,6 +43,16 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux") ${LZ4_LIBRARIES} ) + # + # Event library backend + # + if (NOT LIBURING_FOUND) + message(FATAL_ERROR "liburing was not found") + endif() + + include_directories(${LIBURING_INCLUDE_DIRS}) + link_libraries(${LIBURING_LIBRARY}) + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--no-undefined") elseif(${CMAKE_SYSTEM_NAME} STREQUAL "Darwin") find_program(HOMEBREW_EXECUTABLE brew) @@ -77,7 +85,6 @@ elseif(${CMAKE_SYSTEM_NAME} STREQUAL "Darwin") # include_directories( ${CMAKE_CURRENT_SOURCE_DIR}/include - ${LIBEV_INCLUDE_DIRS} ${OPENSSL_INCLUDE_DIRS} ${THREAD_INCLUDE_DIRS} ${ZLIB_INCLUDE_DIRS} @@ -90,7 +97,6 @@ elseif(${CMAKE_SYSTEM_NAME} STREQUAL "Darwin") # Library directories # link_libraries( - ${LIBEV_LIBRARIES} ${OPENSSL_LIBRARIES} ${THREAD_LIBRARY} ${ZLIB_LIBRARIES} @@ -116,7 +122,6 @@ else() # include_directories( ${CMAKE_CURRENT_SOURCE_DIR}/include - ${LIBEV_INCLUDE_DIRS} ${OPENSSL_INCLUDE_DIRS} ${THREAD_INCLUDE_DIRS} ${ZLIB_INCLUDE_DIRS} @@ -129,7 +134,6 @@ else() # Library directories # link_libraries( - ${LIBEV_LIBRARIES} ${OPENSSL_LIBRARIES} ${THREAD_LIBRARY} ${ZLIB_LIBRARIES} @@ -242,6 +246,7 @@ if (CMAKE_BUILD_TYPE MATCHES Debug) if (HAS_NO_OMIT_FRAME_POINTER) set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -fno-omit-frame-pointer") endif() + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS_DEBUG} -g") endif() if (CMAKE_BUILD_TYPE MATCHES 
Release OR CMAKE_BUILD_TYPE MATCHES RelWithDebInfo) diff --git a/src/bidi.c b/src/bidi.c new file mode 100644 index 00000000..4ea2d5e5 --- /dev/null +++ b/src/bidi.c @@ -0,0 +1,2172 @@ +/* + * Copyright (C) 2024 The pgagroal community + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors may + * be used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR + * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* pgagroal */ +#include +#include +#include +#include + +/* system */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if HAVE_URING +#include +#include +#include +#endif + +#if HAVE_EPOLL +#include +#include +#endif + +#if HAVE_KQUEUE +#include +#include +#include +#endif + +#define TYPEOF(watcher) watcher->io->type + +#define pr_dbg(s) do { printf(s); fflush(stdout); } while (0) +#define SET_ERR(watcher, err) watcher->errcode = err; +#define CLEAN_ERR(watcher) watcher->errcode = 0; + +#define for_each(w, first) for (w = first; w; w = w->next) + +#define list_add(w, first) \ + do { \ + w->next = first; \ + first = w; \ + } while (0) \ + +#define list_delete(w, first, target, ret) \ + do { \ + for (w = first; *w && *w != target; w = &(*w)->next); \ + if (!(*w)) { \ + pgagroal_log_warn("%s: target watcher not found\n", __func__); \ + ret = EV_ERROR; \ + } else { \ + if (!target->next) { \ + *w = NULL; \ + } else { \ + *w = target->next; \ + } \ + } \ + } while (0) \ + +static int (*loop_init)(struct ev_loop*); +static int (*loop_fork)(struct ev_loop**); +static int (*loop_destroy)(struct ev_loop*); +static int (*loop_start)(struct ev_loop*); +static void (*loop_break)(struct ev_loop*); + +static int (*io_start)(struct ev_loop*, struct ev_io*); +static int (*io_stop)(struct ev_loop*, struct ev_io*); +static int io_init(struct ev_io* w, int fd, int event, io_cb cb, void* data, int size, int slot); + +static int (*signal_start)(struct ev_loop*, struct ev_signal*); +static int (*signal_stop)(struct ev_loop*, struct ev_signal*); + +static int (*periodic_init)(struct ev_periodic*, int); +static int (*periodic_start)(struct ev_loop*, struct ev_periodic*); +static int (*periodic_stop)(struct ev_loop*, struct ev_periodic*); + +static bool (*is_running)(struct ev_loop* ev); +static void (*set_running)(struct ev_loop* ev); + +static int setup_ops(struct ev_loop*); +static int 
setup_context(struct ev_context*); + +#if HAVE_URING +static int __io_uring_init(struct ev_loop*); +static int __io_uring_destroy(struct ev_loop*); +static int __io_uring_handler(struct ev_loop*, struct io_uring_cqe*); +static int __io_uring_loop(struct ev_loop*); +static int __io_uring_fork(struct ev_loop**); +static int __io_uring_io_start(struct ev_loop*, struct ev_io*); +static int __io_uring_io_stop(struct ev_loop*, struct ev_io*); +static int __io_uring_setup_buffers(struct ev_loop*); +static int __io_uring_periodic_init(struct ev_periodic* w, int msec); +static int __io_uring_periodic_start(struct ev_loop* loop, struct ev_periodic* w); +static int __io_uring_periodic_stop(struct ev_loop* loop, struct ev_periodic* w); +static int __io_uring_signal_handler(struct ev_loop* ev, int signum); +static int __io_uring_signal_start(struct ev_loop* ev, struct ev_signal* w); +static int __io_uring_signal_stop(struct ev_loop* ev, struct ev_signal* w); +static int __io_uring_receive_handler(struct ev_loop* ev, struct ev_io* w, struct io_uring_cqe* cqe, + bool is_proxy); +static int __io_uring_send_handler(struct ev_loop* ev, struct ev_io* w, struct io_uring_cqe* cqe); +static int __io_uring_accept_handler(struct ev_loop* ev, struct ev_io* w, struct io_uring_cqe* cqe); +static int __io_uring_periodic_handler(struct ev_loop* ev, struct ev_periodic* w); +static int __io_uring_bidi_send_handler(struct ev_loop* ev, struct ev_io* w, struct io_uring_cqe* cqe); +static int __io_uring_bidi_receive_handler(struct ev_loop* ev, struct ev_io* w, struct io_uring_cqe* cqe); + +#endif + +#if HAVE_EPOLL +static int __epoll_init(struct ev_loop*); +static int __epoll_destroy(struct ev_loop*); +static int __epoll_handler(struct ev_loop*, void*); +static int __epoll_loop(struct ev_loop*); +static int __epoll_fork(struct ev_loop**); +static int __epoll_io_start(struct ev_loop*, struct ev_io*); +static int __epoll_io_stop(struct ev_loop*, struct ev_io*); +static int __epoll_io_handler(struct 
ev_loop*, struct ev_io*); +static int __epoll_send_handler(struct ev_loop*, struct ev_io*); +static int __epoll_accept_handler(struct ev_loop*, struct ev_io*); +static int __epoll_receive_handler(struct ev_loop*, struct ev_io*); +static int __epoll_periodic_init(struct ev_periodic*, int); +static int __epoll_periodic_start(struct ev_loop*, struct ev_periodic*); +static int __epoll_periodic_stop(struct ev_loop*, struct ev_periodic*); +static int __epoll_periodic_handler(struct ev_loop*, struct ev_periodic*); +static int __epoll_signal_stop(struct ev_loop*, struct ev_signal*); +static int __epoll_signal_handler(struct ev_loop*); +static int __epoll_signal_start(struct ev_loop*, struct ev_signal*); + +#endif + +#if HAVE_KQUEUE +static int __kqueue_init(struct ev_loop*); +static int __kqueue_destroy(struct ev_loop*); +static int __kqueue_handler(struct ev_loop*, struct kevent*); +static int __kqueue_loop(struct ev_loop*); +static int __kqueue_fork(struct ev_loop**); +static int __kqueue_io_start(struct ev_loop*, struct ev_io*); +static int __kqueue_io_stop(struct ev_loop*, struct ev_io*); +static int __kqueue_io_handler(struct ev_loop*, struct kevent*); +static int __kqueue_send_handler(struct ev_loop*, struct ev_io*); +static int __kqueue_accept_handler(struct ev_loop*, struct ev_io*); +static int __kqueue_receive_handler(struct ev_loop*, struct ev_io*); +static int __kqueue_periodic_init(struct ev_periodic*, int); +static int __kqueue_periodic_start(struct ev_loop*, struct ev_periodic*); +static int __kqueue_periodic_stop(struct ev_loop*, struct ev_periodic*); +static int __kqueue_periodic_handler(struct ev_loop*, struct kevent*); +static int __kqueue_signal_stop(struct ev_loop*, struct ev_signal*); +static int __kqueue_signal_handler(struct ev_loop*, struct kevent*); +static int __kqueue_signal_start(struct ev_loop*, struct ev_signal*); +#endif + +inline static int __attribute__((unused)) +set_non_blocking(int fd) +{ + int flags = fcntl(fd, F_GETFL, 0); + if (flags 
== -1) + { + return EV_ERROR; + } + return fcntl(fd, F_SETFL, flags | O_NONBLOCK); +} + +inline static bool __attribute__((unused)) +is_non_blocking(int fd) +{ + int flags = fcntl(fd, F_GETFL, 0); + return (flags & O_NONBLOCK); +} + +static inline bool +__is_running(struct ev_loop* ev) +{ + return ev->running; +} +static inline bool +__is_running_atomic(struct ev_loop* ev) +{ + return atomic_load(&ev->atomic_running); +} + +static inline void +__set_running(struct ev_loop* ev) +{ + ev->running = true; +} +static inline void +__set_running_atomic(struct ev_loop* ev) +{ + atomic_store(&ev->atomic_running, true); +} + +static inline void +__break(struct ev_loop* loop) +{ + loop->running = false; +} +static inline void +__break_atomic(struct ev_loop* loop) +{ + atomic_store(&loop->atomic_running, false); +} + +inline static bool +__io_uring_enabled(void) +{ + int fd; + char res; + fd = open("/proc/sys/kernel/io_uring_disabled", O_RDONLY); + if (fd < 0) + { + if (errno == ENOENT) + { + return true; + } + pgagroal_log_fatal("Failed to open file /proc/sys/kernel/io_uring_disabled: %s", strerror(errno)); + exit(1); + } + if (read(fd, &res, 1) <= 0) + { + pgagroal_log_fatal("Failed to read file /proc/sys/kernel/io_uring_disabled"); + exit(1); + } + if (close(fd) < 0) + { + pgagroal_log_fatal("Failed to close file descriptor for /proc/sys/kernel/io_uring_disabled: %s", strerror(errno)); + exit(1); + } + + return res == '0'; +} + +struct ev_loop* +pgagroal_ev_init(struct configuration* config) +{ + int ret = EV_OK; + struct ev_loop* ev = calloc(1, sizeof(struct ev_loop)); + + if (!config) + { + struct configuration default_config = { 0 }; + strcpy(default_config.ev_backend, FALLBACK_BACKEND); + if (!config) + { + config = &default_config; + } + } + ev->config = config; + + ret = setup_context(&ev->ctx); + if (ret) + { + pgagroal_log_error("ev_backend: context setup error\n"); + goto error; + } + + /* dummy heads */ + + ev->ihead.slot = -1; + ev->ihead.next = NULL; + 
ev->shead.slot = -1; + ev->shead.next = NULL; + ev->phead.slot = -1; + ev->phead.next = NULL; + + ret = setup_ops(ev); + if (ret) + { + pgagroal_log_error("setup_ops: setup error\n"); + goto error; + } + + /* init */ + + sigemptyset(&ev->sigset); + + ret = loop_init(ev); + if (ret) + { + pgagroal_log_error("loop_init error"); + goto error; + } + return ev; + +error: + free(ev); + return NULL; +} + +int +pgagroal_ev_loop(struct ev_loop* loop) +{ + return loop_start(loop); +} + +int +pgagroal_ev_loop_fork(struct ev_loop** loop) +{ + return loop_fork(loop); +} + +int +pgagroal_ev_loop_destroy(struct ev_loop* ev) +{ + sigemptyset(&ev->sigset); + return loop_destroy(ev); +} + +void +pgagroal_ev_loop_break(struct ev_loop* ev) +{ + loop_break(ev); +} + +int +pgagroal_ev_io_accept_init(struct ev_io* w, int fd, io_cb cb) +{ + return io_init(w, fd, EV_ACCEPT, cb, NULL, 0, -1); +} + +int +pgagroal_ev_io_bidi_init(struct ev_io* w, int fd_in, io_cb cb, io_cb cb2, int fd_out) +{ + return io_init(w, fd_in, EV_BIDI, cb, cb2, fd_out, -1); +} + +int +pgagroal_ev_io_read_init(struct ev_io* w, int fd, io_cb cb) +{ + return io_init(w, fd, READ, cb, NULL, 0, -1); +} + +int +pgagroal_ev_io_send_init(struct ev_io* w, int fd, io_cb cb, void* buf, int buf_len, int bid) +{ + return io_init(w, fd, EV_SEND, cb, buf, buf_len, bid); +} + +int +pgagroal_ev_io_receive_init(struct ev_io* w, int fd, io_cb cb) +{ + return io_init(w, fd, EV_RECEIVE, cb, NULL, 0, -1); +} + +int +pgagroal_ev_io_connect_init(struct ev_io* w, int fd, io_cb cb, union sockaddr_u* addr) +{ + return io_init(w, fd, CONNECT, cb, (void*)addr, 0, -1); +} + +int +pgagroal_ev_io_start(struct ev_loop* ev, struct ev_io* w) +{ + list_add(w, ev->ihead.next); + return io_start(ev, w); +} + +int +pgagroal_ev_io_stop(struct ev_loop* ev, struct ev_io* target) +{ + int ret = EV_OK; + struct ev_io** w; + if (!target) + { + pgagroal_log_fatal("impossible situation: null pointer provided to stop\n"); + } + io_stop(ev, target); + list_delete(w, 
&ev->ihead.next, target, ret); + /* pgagroal deals with fd close */ + return ret; +} + +int +pgagroal_ev_signal_init(struct ev_signal* w, signal_cb cb, int signum) +{ + w->type = EV_SIGNAL; + w->signum = signum; + w->cb = cb; + w->slot = -1; + w->next = NULL; + return EV_OK; +} + +int +pgagroal_ev_signal_start(struct ev_loop* ev, struct ev_signal* w) +{ + sigaddset(&ev->sigset, w->signum); + if (sigprocmask(SIG_BLOCK, &ev->sigset, NULL) == -1) + { + pgagroal_log_fatal("sigprocmask"); + exit(1); + } + signal_start(ev, w); + list_add(w, ev->shead.next); + return EV_OK; +} + +int +pgagroal_ev_signal_stop(struct ev_loop* ev, struct ev_signal* target) +{ + int ret = EV_OK; + struct ev_signal** w; + + if (!target) + { + pgagroal_log_error("NULL pointer provided to stop\n"); + return EV_ERROR; + } + + sigdelset(&ev->sigset, target->signum); + + if (pthread_sigmask(SIG_UNBLOCK, &ev->sigset, NULL) == -1) + { + pgagroal_log_error("%s: pthread_sigmask failed\n", __func__); + return EV_ERROR; + } + + signal_stop(ev, target); + + list_delete(w, &ev->shead.next, target, ret); + + return ret; +} + +int +pgagroal_ev_periodic_init(struct ev_periodic* w, periodic_cb cb, int msec) +{ + if (periodic_init(w, msec)) + { + pgagroal_log_fatal("%s: __periodic_init failed", __func__); + } + w->type = EV_PERIODIC; + w->slot = -1; + w->cb = cb; + w->next = NULL; + return EV_OK; +} + +int +pgagroal_ev_periodic_start(struct ev_loop* loop, struct ev_periodic* w) +{ + periodic_start(loop, w); + list_add(w, loop->phead.next); + return EV_OK; +} + +int +pgagroal_ev_periodic_stop(struct ev_loop* ev, struct ev_periodic* target) +{ + int ret = EV_OK; + struct ev_periodic** w; + if (!target) + { + pgagroal_log_error("null pointer provided to stop\n"); + return EV_ERROR; + } + ret = periodic_stop(ev, target); + list_delete(w, &ev->phead.next, target, ret); + return ret; +} + +static int +setup_ops(struct ev_loop* ev) +{ + int ret = EV_OK; + bool mtt = ev->ctx.multithreading; + struct configuration* 
config = (struct configuration*)shmem; + + is_running = mtt ? __is_running_atomic : __is_running; + set_running = mtt ? __set_running_atomic: __set_running; + loop_break = mtt ? __break_atomic: __break; + + if (!strcmp(config->ev_backend, "io_uring")) + { +#if HAVE_URING + loop_init = __io_uring_init; + loop_fork = __io_uring_fork; + loop_destroy = __io_uring_destroy; + loop_start = __io_uring_loop; + io_start = __io_uring_io_start; + io_stop = __io_uring_io_stop; + periodic_init = __io_uring_periodic_init; + periodic_start = __io_uring_periodic_start; + periodic_stop = __io_uring_periodic_stop; + signal_start = __io_uring_signal_start; + signal_stop = __io_uring_signal_stop; +#endif + } + if (!strcmp(config->ev_backend, "epoll")) + { +#if HAVE_EPOLL + loop_init = __epoll_init; + loop_fork = __epoll_fork; + loop_destroy = __epoll_destroy; + loop_start = __epoll_loop; + io_start = __epoll_io_start; + io_stop = __epoll_io_stop; + periodic_init = __epoll_periodic_init; + periodic_start = __epoll_periodic_start; + periodic_stop = __epoll_periodic_stop; + signal_start = __epoll_signal_start; + signal_stop = __epoll_signal_stop; +#endif + } + else if (!strcmp(config->ev_backend, "kqueue")) + { +#if HAVE_KQUEUE + loop_init = __kqueue_init; + loop_fork = __kqueue_fork; + loop_destroy = __kqueue_destroy; + loop_start = __kqueue_loop; + io_start = __kqueue_io_start; + io_stop = __kqueue_io_stop; + periodic_init = __kqueue_periodic_init; + periodic_start = __kqueue_periodic_start; + periodic_stop = __kqueue_periodic_stop; + signal_start = __kqueue_signal_start; + signal_stop = __kqueue_signal_stop; +#endif + } + + return ret; +} + +/* + * TODO: move this to libpgagroal/configuration.c and allow configuration + */ +static int +setup_context(struct ev_context* ctx) +{ + struct configuration* config = &((struct main_configuration*)shmem)->common; + /* ordered from highest to lowest priority */ + char* backends[] = { +#if HAVE_URING + "io_uring", +#endif +#if HAVE_EPOLL + 
"epoll", +#endif +#if HAVE_KQUEUE + "kqueue", +#endif + }; + char log[] = ( +#if HAVE_URING + "io_uring, " +#endif +#if HAVE_EPOLL + "epoll, " +#endif +#if HAVE_KQUEUE + "kqueue, " +#endif + ); + + if (sizeof(backends) == 0) + { + pgagroal_log_fatal("no ev_backend supported"); + exit(1); + } + + log[strlen(log) - 2] = '\0'; + pgagroal_log_debug("Available ev backends: %s", log); + + if (!strnlen(config->ev_backend, MISC_LENGTH)) + { + pgagroal_log_warn("ev_backend not set in configuration file"); + pgagroal_log_warn("ev_backend automatically set to: 'auto'"); + strcpy(config->ev_backend, "auto"); + } + + /* if auto, select the first supported backend */ + if (!strcmp(config->ev_backend, "auto")) + { + strcpy(config->ev_backend, backends[0]); + } + + pgagroal_log_debug("Selected backend: '%s'", config->ev_backend); + + if (!strcmp(config->ev_backend, "io_uring")) + { + if (!__io_uring_enabled()) + { + pgagroal_log_warn("io_uring supported but not enabled. Enable io_uring by setting /proc/sys/kernel/io_uring_disabled to '0'"); + pgagroal_log_warn("Fallback configured to 'epoll'"); + strcpy(config->ev_backend, "epoll"); + } + else if (config->tls) + { + pgagroal_log_warn("ev_backend '%s' not supported with tls on"); + pgagroal_log_warn("Fallback configured to 'epoll'"); + strcpy(config->ev_backend, "epoll"); + } + +#if HAVE_URING + if (ctx->defer_tw && ctx->sqpoll) + { + pgagroal_log_fatal("cannot use DEFER_TW and SQPOLL at the same time\n"); + exit(1); + } + + ctx->entries = 32; + ctx->params.cq_entries = 64; + ctx->params.flags = 0; + ctx->params.flags |= IORING_SETUP_SINGLE_ISSUER; + ctx->params.flags |= IORING_SETUP_CLAMP; + ctx->params.flags |= IORING_SETUP_CQSIZE; + /* ctx->params.flags |= IORING_FEAT_NODROP */ + + /* default configuration */ + + if (ctx->sqpoll) + { + ctx->params.flags |= IORING_SETUP_SQPOLL; + ctx->params.flags ^= IORING_SETUP_DEFER_TASKRUN; + } + if (!ctx->sqpoll && !ctx->defer_tw) + { + ctx->params.flags |= IORING_SETUP_COOP_TASKRUN; + } + 
if (!ctx->buf_count) + { + ctx->buf_count = BUFFER_COUNT; + } + if (!ctx->buf_size) + { + ctx->buf_size = BUFFER_SIZE; + } + ctx->br_mask = (ctx->buf_count - 1); + + if (ctx->fixed_files) + { + pgagroal_log_fatal("no support for fixed files\n"); /* TODO: add support for fixed files */ + exit(1); + } +#endif + } + else if (!strcmp(config->ev_backend, "epoll")) + { +#if HAVE_EPOLL + ctx->epoll_flags = 0; +#endif + } + + ctx->multithreading = false; + + return EV_OK; +} + +static int +io_init(struct ev_io* w, int fd, int event, io_cb cb, void* data, int size, int slot) +{ + w->fd = fd; + w->type = event; + w->cb = cb; + w->data = data; + w->size = size; + w->slot = slot; + w->bid = -1; + w->errcode = 0; + + if (w->type == EV_BIDI) + { + w->fd_out = size; + w->cb2 = (io_cb) data; + } + return EV_OK; +} + +#if HAVE_URING +static inline struct io_uring_sqe* +__io_uring_get_sqe(struct ev_loop* ev) +{ + struct io_uring* ring = &ev->ring; + struct io_uring_sqe* sqe; + do /* necessary if SQPOLL, but I don't think there is an advantage of using SQPOLL */ + { + sqe = io_uring_get_sqe(ring); + if (sqe) + { + return sqe; + } + else + { + io_uring_sqring_wait(ring); + } + } + while (1); +} + +static inline int +__io_uring_rearm_receive(struct ev_loop* ev, struct ev_io* w) +{ + struct io_uring_sqe* sqe = __io_uring_get_sqe(ev); + io_uring_prep_recv_multishot(sqe, w->fd, NULL, 0, 0); + io_uring_sqe_set_data(sqe, w); + sqe->flags |= IOSQE_BUFFER_SELECT; + sqe->buf_group = IN_BR_BGID; + return EV_OK; +} + +static inline int +__io_uring_replenish_buffers(struct ev_loop* ev, struct io_buf_ring* br, int bid_start, int bid_end) +{ + int count; + struct ev_context ctx = ev->ctx; + if (bid_end >= bid_start) + { + count = (bid_end - bid_start); + } + else + { + count = (bid_end + ctx.buf_count - bid_start); + } + for (int i = bid_start; i != bid_end; i = (i + 1) & (ctx.buf_count - 1)) + { + io_uring_buf_ring_add(br->br, (void*)br->br->bufs[i].addr, ctx.buf_size, i, ctx.br_mask, 0); + } + 
io_uring_buf_ring_advance(br->br, count); + return EV_OK; +} + +static int +__io_uring_init(struct ev_loop* loop) +{ + int ret = EV_OK; + ret = io_uring_queue_init_params(loop->ctx.entries, &loop->ring, &loop->ctx.params); /* on fork: gets a new ring */ + if (ret) + { + pgagroal_log_fatal("io_uring_queue_init_params: %s\n", strerror(-ret)); + } + if (!loop->ctx.no_use_buffers) + { + ret = __io_uring_setup_buffers(loop); + if (ret) + { + pgagroal_log_fatal("%s: __io_uring_setup_buffers: %s\n", __func__, strerror(-ret)); + } + } + return ret; +} + +static int +__io_uring_destroy(struct ev_loop* ev) +{ + /* free buffer rings */ + io_uring_free_buf_ring(&ev->ring, ev->in_br.br, ev->ctx.buf_count, ev->in_br.bgid); + ev->in_br.br = NULL; + io_uring_free_buf_ring(&ev->ring, ev->out_br.br, ev->ctx.buf_count, ev->out_br.bgid); + ev->out_br.br = NULL; + if (ev->ctx.use_huge) + { + /* TODO: munmap(cbr->buf, buf_size * nr_bufs); */ + } + else + { + free(ev->in_br.buf); + free(ev->out_br.buf); + } + io_uring_queue_exit(&ev->ring); + free(ev); + return EV_OK; +} + +static int +__io_uring_io_start(struct ev_loop* ev, struct ev_io* w) +{ + int domain; + union sockaddr_u* addr; + struct io_uring_sqe* sqe = __io_uring_get_sqe(ev); + io_uring_sqe_set_data(sqe, w); + switch (w->type) + { + case EV_ACCEPT: + io_uring_prep_multishot_accept(sqe, w->fd, NULL, NULL, 0); + break; + case EV_BIDI: + case EV_RECEIVE: + //printf("%s: ev_receive\n", __func__); fflush(stdout); + io_uring_prep_recv(sqe, w->fd, NULL, 0, 0); + sqe->flags |= IOSQE_BUFFER_SELECT | MSG_WAITALL; + sqe->buf_group = IN_BR_BGID; + break; + case EV_SEND: + //printf("%s: ev_send\n", __func__); fflush(stdout); + io_uring_prep_send(sqe, w->fd, w->data, w->size, 0); /* TODO: flags */ + sqe->buf_group = OUT_BR_BGID; + break; + case CONNECT: + addr = (union sockaddr_u*)w->data; + if (ev->ctx.ipv6) + { + io_uring_prep_connect(sqe, w->fd, (struct sockaddr*) &addr->addr6, sizeof(struct sockaddr_in6)); + } + else + { + 
io_uring_prep_connect(sqe, w->fd, (struct sockaddr*) &addr->addr4, sizeof(struct sockaddr_in)); + } + break; + case SOCKET: + if (ev->ctx.ipv6) + { + domain = AF_INET6; + } + else + { + domain = AF_INET; + } + io_uring_prep_socket(sqe, domain, SOCK_STREAM, 0, 0); + break; + case READ: /* unused */ + io_uring_prep_read(sqe, w->fd, w->data, w->size, 0); + break; + default: + pgagroal_log_fatal("%s: unknown event type: %d\n", __func__, w->type); + return EV_ERROR; + } + return EV_OK; +} + +static int +__io_uring_io_stop(struct ev_loop* ev, struct ev_io* target) +{ + int ret = EV_OK; + struct io_uring_sqe* sqe; + sqe = io_uring_get_sqe(&ev->ring); + io_uring_prep_cancel64(sqe, (uint64_t)target, 0); /* TODO: flags? */ + return ret; +} + +static int +__io_uring_signal_start(struct ev_loop* ev, struct ev_signal* w) +{ + return EV_OK; +} + +static int +__io_uring_signal_stop(struct ev_loop* ev, struct ev_signal* w) +{ + return EV_OK; +} + +static int +__io_uring_periodic_init(struct ev_periodic* w, int msec) +{ + /* TODO: how optimized is designated initializers really */ + w->ts = (struct __kernel_timespec) { + .tv_sec = msec / 1000, + .tv_nsec = (msec % 1000) * 1000000 + }; + return EV_OK; +} + +static int +__io_uring_periodic_start(struct ev_loop* loop, struct ev_periodic* w) +{ + struct io_uring_sqe* sqe = io_uring_get_sqe(&loop->ring); + io_uring_sqe_set_data(sqe, w); + io_uring_prep_timeout(sqe, &w->ts, 0, IORING_TIMEOUT_MULTISHOT); + return EV_OK; +} + +static int +__io_uring_periodic_stop(struct ev_loop* loop, struct ev_periodic* w) +{ + struct io_uring_sqe* sqe; + sqe = io_uring_get_sqe(&loop->ring); + io_uring_prep_cancel64(sqe, (uint64_t)w, 0); /* TODO: flags? 
*/ + return EV_OK; +} + +/* + * Based on: https://git.kernel.dk/cgit/liburing/tree/examples/proxy.c + * (C) 2024 Jens Axboe + */ +static int +__io_uring_loop(struct ev_loop* ev) +{ + int ret; + int signum; + int events; + int to_wait = 1; /* wait for any 1 */ + unsigned int head; + struct io_uring_cqe* cqe; + struct __kernel_timespec* ts; + struct __kernel_timespec idle_ts = { + .tv_sec = 0, + .tv_nsec = 100000000LL + }; + struct timespec timeout = { + .tv_sec = 0, + .tv_nsec = 0 + }; + + set_running(ev); + while (is_running(ev)) + { + ts = &idle_ts; + io_uring_submit_and_wait_timeout(&ev->ring, &cqe, to_wait, ts, NULL); + + /* Good idea to leave here to see what happens */ + if (*ev->ring.cq.koverflow) + { + pgagroal_log_error("io_uring overflow %u\n", *ev->ring.cq.koverflow); + exit(EXIT_FAILURE); + } + if (*ev->ring.sq.kflags & IORING_SQ_CQ_OVERFLOW) + { + pgagroal_log_error("io_uring overflow\n"); + exit(EXIT_FAILURE); + } + + /* Check for signals before iterating over cqes */ + signum = sigtimedwait(&ev->sigset, NULL, &timeout); + if (signum > 0) + { + ret = __io_uring_signal_handler(ev, signum); + + if (ret == EV_ERROR) + { + pgagroal_log_error("Signal handling error\n"); + return EV_ERROR; + } + if (!is_running(ev)) + { + break; + } + } + + events = 0; + io_uring_for_each_cqe(&(ev->ring), head, cqe) + { + /* Currently closing the main connection fd means that pgagroal loop + * will stop, so just return an error to the caller. If the caller + * eventually decides to continue the loop, the + * caller will have to handle this error. + */ + ret = __io_uring_handler(ev, cqe); + if (ret == EV_CLOSE_FD) + { + return EV_CLOSE_FD; + } + if (ret == EV_ERROR) + { + pgagroal_log_error("__io_uring_handler error\n"); + return EV_ERROR; + } + events++; + } + if (events) + { + io_uring_cq_advance(&ev->ring, events); /* batch marking as seen */ + } + + /* TODO: housekeeping ? 
*/ + + } + return EV_OK; +} + +static int +__io_uring_fork(struct ev_loop** loop) +{ + struct ev_loop* tmp = *loop; + *loop = pgagroal_ev_init(tmp->config); + __io_uring_destroy(tmp); + + return EV_OK; +} + +static int +__io_uring_handler(struct ev_loop* ev, struct io_uring_cqe* cqe) +{ + int ret = EV_OK; + ev_watcher w; + w.io = (ev_io*)io_uring_cqe_get_data(cqe); + + /* + * Cancelled requests will trigger the handler, but have NULL data. + */ + if (!w.io) + { + return EV_OK; + } + + /* io handler */ + //printf("%s: entering %d\n", __func__, w.io->type); fflush(stdout); + switch (w.io->type) + { + case EV_PERIODIC: + return __io_uring_periodic_handler(ev, w.periodic); + case EV_ACCEPT: + return __io_uring_accept_handler(ev, w.io, cqe); + case EV_BIDI: + //printf("%s: got to ev_bidi bid=%d\n", __func__, w.io->bid); fflush(stdout); + if (w.io->bid < 0) + { + return __io_uring_bidi_receive_handler(ev, w.io, cqe); + } + else + { + return __io_uring_bidi_send_handler(ev, w.io, cqe); + } + break; + case EV_SEND: + return __io_uring_send_handler(ev, w.io, cqe); + + case EV_RECEIVE: +retry: + ret = __io_uring_receive_handler(ev, w.io, cqe, false); + switch (ret) + { + case EV_CLOSE_FD: /* connection closed */ + /* pgagroal deals with closing fd */ + break; + case EV_REPLENISH_BUFFERS: /* TODO: stress test. Buffers should be replenished after each recv. 
*/ + pgagroal_log_warn("__io_uring_receive_handler: request requeued\n"); + exit(1); + usleep(100); + goto retry; + break; + } + break; + default: + pgagroal_log_fatal("%s: _io_handler: event not found eventno=%d", __func__, w.io->type); + } + return ret; +} + +static int +__io_uring_periodic_handler(struct ev_loop* ev, struct ev_periodic* w) +{ + w->cb(ev, w, 0); + return EV_OK; +} + +static int +__io_uring_bidi_send_handler(struct ev_loop* ev, struct ev_io* w, struct io_uring_cqe* cqe) +{ + // int revents = EV_OK; + struct io_buf_ring* in_br = &ev->in_br; + // // struct io_buf_ring* out_br = &ev->out_br; + struct ev_context ctx = ev->ctx; + //printf("%s: entering\n", __func__); fflush(stdout); + if (!cqe->res) + { + w->errcode = EV_ERROR; + w->cb2(ev, w, EV_ERROR); + } + // __io_uring_rearm_receive(ev, w); + +#if 1 /* TODO : #if DEBUG */ + assert (w->bid >= 0); +#endif + + io_uring_buf_ring_add(in_br->br, (void*) in_br->br->bufs[w->bid].addr, ctx.buf_size, w->bid, ctx.br_mask, 0); + io_uring_buf_ring_advance(in_br->br, 1); + + w->bid = -1; + + assert (w->bid < 0); + struct io_uring_sqe* sqe = __io_uring_get_sqe(ev); + io_uring_sqe_set_data(sqe, w); + io_uring_prep_recv(sqe, w->fd, NULL, 0, 0); + sqe->flags |= IOSQE_BUFFER_SELECT | MSG_WAITALL; + sqe->buf_group = IN_BR_BGID; + return EV_OK; +} + +static int +__io_uring_bidi_receive_handler(struct ev_loop* ev, struct ev_io* w, struct io_uring_cqe* cqe) +{ + + int ret = EV_OK; + int bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT; + struct ev_context ctx = ev->ctx; + struct io_buf_ring* in_br = &ev->in_br; + //struct io_buf_ring* out_br = &ev->out_br; + int total_in_bytes; + //printf("%s: entering\n", __func__); fflush(stdout); + + if (cqe->res == -ENOBUFS) + { + pgagroal_log_warn("io_receive_handler: Not enough buffers\n"); + return EV_REPLENISH_BUFFERS; + } + + if (!(cqe->flags & IORING_CQE_F_BUFFER)) + { + if (!(cqe->res)) /* Closed connection */ + { + return EV_CLOSE_FD; + } + } + + /* From the docs: 
https://man7.org/linux/man-pages/man3/io_uring_prep_recv_multishot.3.html + * "If a posted CQE does not have the IORING_CQE_F_MORE flag set then the multishot receive will be + * done and the application should issue a new request." + */ + // if (!(cqe->flags & IORING_CQE_F_MORE)) + // { + // pgagroal_log_warn("need to rearm receive: added timeout"); + // ret = __io_uring_rearm_receive(ev, w); + // if (ret) + // { + // return EV_ERROR; + // } + // } + + total_in_bytes = cqe->res; + if (total_in_bytes >= MAX_BUFFER_SIZE) + { + pgagroal_log_fatal("unexpected"); + exit(1); + } + + w->data = in_br->buf + (bid * ctx.buf_size); + w->size = total_in_bytes; + w->bid = bid; + w->cb(ev, w, ret); + + struct io_uring_sqe* sqe = __io_uring_get_sqe(ev); + io_uring_sqe_set_data(sqe, w); + // printf("%s: w->data=%s\n", __func__, (char*)w->data); + io_uring_prep_send(sqe, w->fd_out, w->data, w->size, 0); + sqe->buf_group = OUT_BR_BGID; + // io_uring_submit(&ev->ring); + //printf("%s: exiting\n", __func__); fflush(stdout); + + // io_uring_buf_ring_add(in_br->br, (void*)in_br->br->bufs[bid].addr, ctx.buf_size, bid, ctx.br_mask, 0); + // io_uring_buf_ring_advance(in_br->br, 1); + + return EV_OK; +} + +static int +__io_uring_accept_handler(struct ev_loop* ev, struct ev_io* w, struct io_uring_cqe* cqe) +{ + w->fd_out = cqe->res; + w->cb(ev, w, 0); + return EV_OK; +} + +static int +__io_uring_send_handler(struct ev_loop* ev, struct ev_io* w, struct io_uring_cqe* cqe) +{ + int revents = EV_OK; + if (!cqe->res) + { + revents = EV_ERROR; + } + w->cb(ev, w, revents); + + return EV_OK; +} + +static int +__io_uring_signal_handler(struct ev_loop* ev, int signum) +{ + struct ev_signal* w; + for (w = ev->shead.next; w && w->signum != signum; w = w->next) + { + /* empty */; + } + if (!w) + { + pgagroal_log_error("no watcher for signal %d\n", signum); + exit(EXIT_FAILURE); + } + w->cb(ev, w, 0); + return EV_OK; +} + +static int +__io_uring_receive_handler(struct ev_loop* ev, struct ev_io* w, struct 
io_uring_cqe* cqe, bool is_proxy) +{ + int ret = EV_OK; + int bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT; + struct ev_context ctx = ev->ctx; + struct io_buf_ring* in_br = &ev->in_br; + int total_in_bytes; + + if (cqe->res == -ENOBUFS) + { + pgagroal_log_warn("io_receive_handler: Not enough buffers\n"); + return EV_REPLENISH_BUFFERS; + } + + if (!(cqe->flags & IORING_CQE_F_BUFFER)) + { + if (!(cqe->res)) /* Closed connection */ + { + return EV_CLOSE_FD; + } + } + + /* From the docs: https://man7.org/linux/man-pages/man3/io_uring_prep_recv_multishot.3.html + * "If a posted CQE does not have the IORING_CQE_F_MORE flag set then the multishot receive will be + * done and the application should issue a new request." + */ + if (!(cqe->flags & IORING_CQE_F_MORE)) + { + pgagroal_log_warn("need to rearm receive: added timeout"); + ret = __io_uring_rearm_receive(ev, w); + if (ret) + { + return EV_ERROR; + } + } + + total_in_bytes = cqe->res; + if (total_in_bytes >= MAX_BUFFER_SIZE) + { + pgagroal_log_fatal("unexpected"); + exit(1); + } + + /* This is not valid anymore as the buffers can fit anything. + * If the size of the buffer (this_bytes) is greater than the size of the received bytes, then continue. + * Otherwise, we iterate over another buffer. + */ + // in_bytes = cqe->res; + // while (in_bytes) + // { + // buf = &(in_br->br->bufs[bid]); + // data = (char*) buf->addr; + // this_bytes = buf->len; + + /* Break if the received bytes is smaller than buffer length. + * Otherwise, continue iterating over the buffers. 
*/ + // if (this_bytes > in_bytes) + // { + // this_bytes = in_bytes; + // } + + // io_uring_buf_ring_add(out_br->br, data, this_bytes, bid, ctx.br_mask, 0); + // io_uring_buf_ring_advance(out_br->br, 1); + + // in_bytes -= this_bytes; + + // *bid = (*bid + 1) & (ctx.buf_count - 1); + // } + + w->data = in_br->buf + (bid * ctx.buf_size); + w->size = total_in_bytes; + w->bid = bid; + w->cb(ev, w, ret); + + /* return buffer to the pool */ + /* get first available out_br */ + // if (w->type == EV_BIDI) + // { + // io_uring_buf_ring_add(out_br->br, (void*)in_br->br->bufs[bid].addr, ctx.buf_size, bid, ctx.br_mask, 0); + // io_uring_buf_ring_advance(out_br->br, 1); + // } + // io_uring_buf_ring_add(in_br->br, (void*)in_br->br->bufs[bid].addr, ctx.buf_size, bid, ctx.br_mask, 0); + // io_uring_buf_ring_advance(in_br->br, 1); + + return EV_OK; +} + +static int +__io_uring_setup_buffers(struct ev_loop* ev) +{ + int ret = EV_OK; + void* ptr; + struct ev_context ctx = ev->ctx; + + struct io_buf_ring* in_br = &ev->in_br; + struct io_buf_ring* out_br = &ev->out_br; + + if (ctx.use_huge) + { + pgagroal_log_warn("use_huge not implemented yet\n"); /* TODO */ + } + if (posix_memalign(&in_br->buf, ALIGNMENT, ctx.buf_count * ctx.buf_size)) + { + pgagroal_log_fatal("posix_memalign"); + exit(1); + } + + in_br->br = io_uring_setup_buf_ring(&ev->ring, ctx.buf_count, IN_BR_BGID, 0, &ret); + out_br->br = io_uring_setup_buf_ring(&ev->ring, ctx.buf_count, OUT_BR_BGID, 0, &ret); + if (!in_br->br || !out_br->br) + { + pgagroal_log_fatal("buffer ring register failed %d\n", ret); + exit(1); + } + + ptr = in_br->buf; + for (int i = 0; i < ctx.buf_count; i++) + { + io_uring_buf_ring_add(in_br->br, ptr, ctx.buf_size, i, ctx.br_mask, i); + ptr += ctx.buf_size; + } + io_uring_buf_ring_advance(in_br->br, ctx.buf_count); + + // ptr = in_br->buf; + // out_br->available = 0; + // for (int i = 0; i < ctx.buf_count; i++) + // { + // io_uring_buf_ring_add(out_br->br, ptr, ctx.buf_size, i, ctx.br_mask, i); + 
// ptr += ctx.buf_size; + // } + // io_uring_buf_ring_advance(out_br->br, ctx.buf_count); + + return ret; +} + +void +_next_bid(struct ev_loop* ev, int* bid) +{ + struct ev_context ctx = ev->ctx; + *bid = (*bid + 1) % ctx.buf_count; +} +#endif + +/******************************************************************************** + * * + * EPOLL * + * * + *********************************************************************************/ + +#if HAVE_EPOLL + +int +__epoll_loop(struct ev_loop* loop) +{ + int ret; + int nfds; + struct epoll_event events[MAX_EVENTS]; + int timeout = 10; + struct epoll_event ev = { + .events = EPOLLIN, /* | EPOLLET */ + .data.fd = signalfd(-1, &loop->sigset, 0), + }; + if (ev.data.fd == -1) + { + pgagroal_log_fatal("signalfd"); + exit(1); + } + if (epoll_ctl(loop->epollfd, EPOLL_CTL_ADD, ev.data.fd, &ev) == -1) + { + pgagroal_log_fatal("epoll_ctl (signalfd)"); + exit(1); + } + + set_running(loop); + while (is_running(loop)) + { + /* TODO: see if using epoll_pwait2 is better than current implementation + * + * nfds = epoll_pwait2(loop->epollfd, events, MAX_EVENTS, &timeout, NULL); + */ + nfds = epoll_wait(loop->epollfd, events, MAX_EVENTS, timeout); + + if (!is_running(loop)) + { + break; + } + for (int i = 0; i < nfds; i++) + { + if (events[i].data.fd == ev.data.fd) + { + ret = __epoll_signal_handler(loop); + } + else + { + ret = __epoll_handler(loop, (void*)events[i].data.u64); + /* Currently closing the main connection fd means that pgagroal loop + * will stop, so just return an error to the caller. If the caller + * eventually decides to continue the loop, the + * caller will have to handle this error. 
+ */ + if (ret == EV_CLOSE_FD) + { + return EV_CLOSE_FD; + } + if (ret == EV_ERROR) + { + pgagroal_log_error("handler error"); + return EV_ERROR; + } + } + } + } + return EV_OK; +} + +static int +__epoll_init(struct ev_loop* ev) +{ + ev->buffer = malloc(sizeof(char) * (MAX_BUFFER_SIZE)); + ev->epollfd = epoll_create1(ev->ctx.epoll_flags); + if (ev->epollfd == -1) + { + pgagroal_log_error("epoll init error"); + return EV_ERROR; + } + + return EV_OK; +} + +static int +__epoll_fork(struct ev_loop** parent_loop) +{ + /* TODO destroy everything related to loop */ + if (sigprocmask(SIG_UNBLOCK, &(*parent_loop)->sigset, NULL) == -1) + { + pgagroal_log_fatal("sigprocmask"); + exit(1); + } + sigemptyset(&(*parent_loop)->sigset); + close((*parent_loop)->epollfd); + return EV_OK; +} + +static int +__epoll_destroy(struct ev_loop* ev) +{ + close(ev->epollfd); + free(ev); + return EV_OK; +} + +static int +__epoll_handler(struct ev_loop* ev, void* wp) +{ + struct ev_periodic* w = (struct ev_periodic*)wp; + if (w->type == EV_PERIODIC) + { + return __epoll_periodic_handler(ev, (struct ev_periodic*)w); + } + return __epoll_io_handler(ev, (struct ev_io*)w); +} + +static int +__epoll_signal_start(struct ev_loop* ev, struct ev_signal* w) +{ + + return EV_OK; +} + +static int +__epoll_signal_stop(struct ev_loop* ev, struct ev_signal* w) +{ + return EV_OK; +} + +static int +__epoll_signal_handler(struct ev_loop* ev) +{ + struct ev_signal* w; + siginfo_t siginfo; + int signo; + signo = sigwaitinfo(&ev->sigset, &siginfo); + if (signo == -1) + { + pgagroal_log_error("sigwaitinfo"); + return EV_ERROR; + } + + for_each(w, ev->shead.next) + { + if (w->signum == signo) + { + w->cb(ev, w, 0); + return EV_OK; + } + } + + pgagroal_log_error("No handler found for signal %d\n", signo); + return EV_ERROR; +} + +static int +__epoll_periodic_init(struct ev_periodic* w, int msec) +{ + struct timespec now; + struct itimerspec new_value; + + if (clock_gettime(CLOCK_MONOTONIC, &now) == -1) /* TODO: 
evaluate what kind of clock to use (!) */ + { + pgagroal_log_error("clock_gettime"); + return EV_ERROR; + } + + new_value.it_value.tv_sec = msec / 1000; + new_value.it_value.tv_nsec = (msec % 1000) * 1000000; + + new_value.it_interval.tv_sec = msec / 1000; + new_value.it_interval.tv_nsec = (msec % 1000) * 1000000; + + w->fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK); /* no need to set it to non-blocking due to TFD_NONBLOCK */ + if (w->fd == -1) + { + perror("timerfd_create"); + return EV_ERROR; + } + + if (timerfd_settime(w->fd, 0, &new_value, NULL) == -1) + { + perror("timerfd_settime"); + close(w->fd); + return EV_ERROR; + } + return EV_OK; +} + +static int +__epoll_periodic_start(struct ev_loop* loop, struct ev_periodic* w) +{ + struct epoll_event event; + event.events = EPOLLIN; /* | EPOLLET */ + event.data.u64 = (uint64_t)w; + if (epoll_ctl(loop->epollfd, EPOLL_CTL_ADD, w->fd, &event) == -1) + { + perror("epoll_ctl"); + close(w->fd); + return EV_ERROR; + } + return EV_OK; +} + +static int +__epoll_periodic_stop(struct ev_loop* loop, struct ev_periodic* w) +{ + if (epoll_ctl(loop->epollfd, EPOLL_CTL_DEL, w->fd, NULL) == -1) + { + pgagroal_log_error("%s: epoll_ctl: delete failed", __func__); + return EV_ERROR; + } + return EV_OK; +} + +static int +__epoll_periodic_handler(struct ev_loop* ev, struct ev_periodic* w) +{ + uint64_t exp; + int nread = read(w->fd, &exp, sizeof(uint64_t)); + if (nread != sizeof(uint64_t)) + { + pgagroal_log_error("periodic_handler: read"); + return EV_ERROR; + } + w->cb(ev, w, 0); + return EV_OK; +} + +static int +__epoll_io_start(struct ev_loop* ev, struct ev_io* w) +{ + struct epoll_event event; + switch (w->type) + { + case EV_ACCEPT: + case EV_RECEIVE: + event.events = EPOLLIN; /* | EPOLLET */ + break; + case EV_SEND: + event.events = EPOLLOUT; /* | EPOLLET */ + break; + default: + pgagroal_log_fatal("%s: unknown event type: %d\n", __func__, w->type); + return EV_ERROR; + } + if (set_non_blocking(w->fd)) /* TODO: err handling 
*/
+   {
+      pgagroal_log_fatal("%s: set_non_blocking", __func__);
+      exit(1);
+   }
+   event.data.u64 = (uint64_t)w;
+
+   if (epoll_ctl(ev->epollfd, EPOLL_CTL_ADD, w->fd, &event) == -1)
+   {
+      pgagroal_log_fatal("%s: epoll_ctl", __func__);
+      exit(1);
+   }
+   return EV_OK;
+}
+
+/**
+ * Stop watching the file descriptor of an I/O watcher.
+ *
+ * The descriptor is removed from the epoll set only if it still appears to be
+ * open, since it may already have been closed elsewhere.
+ *
+ * @param ev The event loop
+ * @param target The I/O watcher to stop
+ * @return EV_OK on success, EV_ERROR if EPOLL_CTL_DEL fails
+ */
+static int
+__epoll_io_stop(struct ev_loop* ev, struct ev_io* target)
+{
+   int ret = EV_OK;
+   bool fd_is_open = fcntl(target->fd, F_GETFD) != -1 || errno != EBADF;
+
+   /* TODO: pgagroal deals with closing fds, so dealing with EPOLL_CTL_DEL may be unnecessary */
+   if (fd_is_open)
+   {
+      if (epoll_ctl(ev->epollfd, EPOLL_CTL_DEL, target->fd, NULL) == -1)
+      {
+         ret = EV_ERROR;
+      }
+   }
+
+   return ret;
+}
+
+/**
+ * Dispatch a ready I/O watcher to the accept, send or receive handler.
+ * @param ev The event loop
+ * @param w The I/O watcher
+ * @return The return code of the selected handler
+ */
+static int
+__epoll_io_handler(struct ev_loop* ev, struct ev_io* w)
+{
+   int ret = EV_OK;
+   switch (w->type)
+   {
+      case EV_ACCEPT:
+         return __epoll_accept_handler(ev, w);
+      case EV_SEND:
+         return __epoll_send_handler(ev, w);
+      case EV_RECEIVE:
+         ret = __epoll_receive_handler(ev, w);
+         switch (ret)
+         {
+            case EV_CLOSE_FD: /* connection closed */
+               /* pgagroal deals with closing fd, so either remove this here or there */
+               break;
+         }
+         break;
+      default:
+         pgagroal_log_fatal("%s: unknown value for event type %d\n", __func__, w->type);
+   }
+
+   return ret;
+}
+
+/**
+ * Drain readable data from w->fd into the loop buffer and pass it to the
+ * watcher callback through w->data / w->size.
+ *
+ * Each recv() is limited to the space left in the buffer, and reading stops
+ * once the buffer is full. Watchers are registered level-triggered (EPOLLET is
+ * not set), so any data still queued in the socket raises another event.
+ *
+ * @param ev The event loop
+ * @param w The I/O watcher
+ * @return EV_OK, EV_CLOSE_FD if the peer closed the connection, or
+ *         EV_ALLOC_ERROR if the loop has no buffer
+ */
+static int
+__epoll_receive_handler(struct ev_loop* ev, struct ev_io* w)
+{
+   int ret = EV_OK;
+   int nrecv = 0;
+   int total_recv = 0;
+   void* buf = ev->buffer;
+   if (!buf)
+   {
+      perror("malloc error");
+      return EV_ALLOC_ERROR;
+   }
+
+   if (!w->ssl)
+   {
+      while (total_recv < MAX_BUFFER_SIZE)
+      {
+         /* Never ask for more than what still fits behind the bytes already read */
+         nrecv = recv(w->fd, (char*)buf + total_recv, MAX_BUFFER_SIZE - total_recv, 0);
+         if (nrecv == -1)
+         {
+            if (errno != EAGAIN && errno != EWOULDBLOCK)
+            {
+               pgagroal_log_error("receive_handler: recv\n");
+            }
+            break;
+         }
+         else if (nrecv == 0) /* connection closed */
+         {
+            ret = EV_CLOSE_FD;
+            pgagroal_log_info("Connection closed fd_in=%d fd_out=%d\n", w->fd, w->fd_out);
+            break;
+         }
+
+         total_recv += nrecv;
+      }
+      w->data = buf;
+      w->size = total_recv;
+   }
+
+   w->cb(ev, w, ret);
+   return ret;
+}
+
+/**
+ * Accept every pending connection on the listening socket w->fd, storing each
+ * new descriptor in w->fd_out and invoking the watcher callback for it.
+ * @param ev The event loop
+ * @param w The accept watcher
+ * @return EV_OK once no more connections are pending, EV_ERROR on any other accept() failure
+ */
+static int
+__epoll_accept_handler(struct ev_loop* ev, struct ev_io* w)
+{
+   int ret = EV_OK;
+   int listen_fd = w->fd;
+
+   /* TODO: check again if needed:
+    *
+    * struct sockaddr_in client_addr;
+    * socklen_t client_len = sizeof(client_addr);
+    */
+
+   while (1)
+   {
+      w->fd_out = accept(listen_fd, NULL, NULL);
+      if (w->fd_out == -1)
+      {
+         /*
+          * NOTE: pgagroal deals with accept returning -1
+          */
+         if ((errno == EAGAIN) || (errno == EWOULDBLOCK))
+         {
+            ret = EV_OK;
+         }
+         else
+         {
+            ret = EV_ERROR;
+         }
+         errno = 0;
+         break;
+      }
+      w->cb(ev, w, ret);
+   }
+
+   return ret;
+}
+
+/**
+ * Send the w->size bytes at w->data on w->fd, retrying while the socket
+ * reports EAGAIN / EWOULDBLOCK.
+ *
+ * EPIPE is tested before the generic error branch; in the previous ordering
+ * that branch could never be reached, so a closed peer was reported as EV_ERROR.
+ *
+ * @param ev The event loop
+ * @param w The send watcher
+ * @return EV_OK when everything was sent, EV_CLOSE_FD if the peer closed the
+ *         connection, EV_ERROR on any other send() failure
+ */
+static int
+__epoll_send_handler(struct ev_loop* ev, struct ev_io* w)
+{
+   int ret = EV_OK;
+   ssize_t nsent;
+   size_t total_sent = 0;
+   int fd = w->fd;
+   void* buf = w->data;
+   size_t buf_len = w->size;
+
+   if (!w->ssl)
+   {
+      while (total_sent < buf_len)
+      {
+         nsent = send(fd, (char*)buf + total_sent, buf_len - total_sent, 0);
+         if (nsent == -1)
+         {
+            if (errno == EPIPE)
+            {
+               ret = EV_CLOSE_FD;
+               break;
+            }
+            else if (errno != EAGAIN && errno != EWOULDBLOCK)
+            {
+               perror("send");
+               ret = EV_ERROR;
+               break;
+            }
+            /* EAGAIN / EWOULDBLOCK: try again */
+         }
+         else
+         {
+            total_sent += nsent;
+         }
+      }
+   }
+
+   /*
+    * NOTE: Maybe there is an advantage in rearming here since the loop uses non blocking sockets.
+    * But I don't know the case where error occurred and exited the loop and can be recovered.
+    *
+    * Example:
+    * if (total_sent < buf_len)
+    * pgagroal_io_send_init(w, fd, cb, buf + total_sent, buf_len, 0);
+    */
+
+   return ret;
+}
+#endif
+
+#if HAVE_KQUEUE
+
+/**
+ * Run the kqueue event loop until it is stopped or a handler reports
+ * EV_CLOSE_FD / EV_ERROR. kevent() is polled with a 10 ms timeout.
+ * @param ev The event loop
+ * @return EV_OK when the loop was stopped, otherwise the handler's return code
+ */
+int
+__kqueue_loop(struct ev_loop* ev)
+{
+   int ret;
+   int nfds;
+   struct kevent events[MAX_EVENTS];
+   struct timespec timeout;
+   timeout.tv_sec = 0;
+   timeout.tv_nsec = 10000000; /* 10 ms */
+
+   set_running(ev);
+   while (is_running(ev))
+   {
+      nfds = kevent(ev->kqueuefd, NULL, 0, events, MAX_EVENTS, &timeout);
+
+      if (nfds == -1)
+      {
+         if (errno == EINTR)
+         {
+            continue;
+         }
+         pgagroal_log_error("kevent");
+         return EV_ERROR;
+      }
+
+      if (!is_running(ev))
+      {
+         break;
+      }
+      for (int i = 0; i < nfds; i++)
+      {
+         ret = __kqueue_handler(ev, &events[i]);
+         /* Currently closing the main connection fd means that pgagroal loop
+          * will stop, so just return an error to the caller. If the caller
+          * eventually decides to continue the loop, the
+          * caller will have to handle this error.
+          */
+         if (ret == EV_CLOSE_FD)
+         {
+            return EV_CLOSE_FD;
+         }
+         if (ret == EV_ERROR)
+         {
+            pgagroal_log_fatal("kqueue_handler");
+            return EV_ERROR;
+         }
+      }
+   }
+   return EV_OK;
+}
+
+/**
+ * Initialize the kqueue backend: create the kqueue and allocate the shared
+ * read buffer of MAX_BUFFER_SIZE bytes.
+ *
+ * The buffer is allocated only after the kqueue exists, so a failing kqueue()
+ * no longer leaks it. A failed allocation leaves ev->buffer NULL, which
+ * __kqueue_receive_handler() reports as EV_ALLOC_ERROR.
+ *
+ * @param ev The event loop
+ * @return EV_OK on success, EV_ERROR if the kqueue cannot be created
+ */
+static int
+__kqueue_init(struct ev_loop* ev)
+{
+   /* Known state, so that __kqueue_destroy() can free() unconditionally */
+   ev->buffer = NULL;
+
+   ev->kqueuefd = kqueue();
+   if (ev->kqueuefd == -1)
+   {
+      pgagroal_log_error("kqueue init error");
+      return EV_ERROR;
+   }
+
+   ev->buffer = malloc(sizeof(char) * (MAX_BUFFER_SIZE));
+   return EV_OK;
+}
+
+static int
+__kqueue_fork(struct ev_loop** parent_loop)
+{
+   /* TODO: Destroy everything related to loop */
+   close((*parent_loop)->kqueuefd);
+   return EV_OK;
+}
+
+static int
+__kqueue_destroy(struct ev_loop* ev)
+{
+   close(ev->kqueuefd);
+   free(ev->buffer);
+   free(ev);
+   return EV_OK;
+}
+
+static int
+__kqueue_handler(struct ev_loop* ev, struct kevent* kev)
+{
+   if (kev->filter == EVFILT_TIMER)
+   {
+      return __kqueue_periodic_handler(ev, kev);
+   }
+   else if (kev->filter == EVFILT_SIGNAL)
+   {
+      return __kqueue_signal_handler(ev, kev);
+   }
+   else if (kev->filter == EVFILT_READ || kev->filter ==
EVFILT_WRITE)
+   {
+      return __kqueue_io_handler(ev, kev);
+   }
+   else
+   {
+      pgagroal_log_error("Unknown filter in handler");
+      return EV_ERROR;
+   }
+}
+
+/**
+ * Register a signal watcher with the kqueue (EVFILT_SIGNAL); the watcher is
+ * stored as the event's user data.
+ * @param ev The event loop
+ * @param w The signal watcher
+ * @return EV_OK on success, EV_ERROR if kevent() fails
+ */
+static int
+__kqueue_signal_start(struct ev_loop* ev, struct ev_signal* w)
+{
+   struct kevent kev;
+
+   EV_SET(&kev, w->signum, EVFILT_SIGNAL, EV_ADD, 0, 0, w);
+   if (kevent(ev->kqueuefd, &kev, 1, NULL, 0, NULL) == -1)
+   {
+      pgagroal_log_error("kevent: signal add");
+      return EV_ERROR;
+   }
+   return EV_OK;
+}
+
+/**
+ * Remove a signal watcher from the kqueue.
+ * @param ev The event loop
+ * @param w The signal watcher
+ * @return EV_OK on success, EV_ERROR if kevent() fails
+ */
+static int
+__kqueue_signal_stop(struct ev_loop* ev, struct ev_signal* w)
+{
+   struct kevent kev;
+
+   EV_SET(&kev, w->signum, EVFILT_SIGNAL, EV_DELETE, 0, 0, w);
+   if (kevent(ev->kqueuefd, &kev, 1, NULL, 0, NULL) == -1)
+   {
+      pgagroal_log_error("kevent: signal delete");
+      return EV_ERROR;
+   }
+   return EV_OK;
+}
+
+/**
+ * Run the callback of the signal watcher attached to a signal event.
+ * @param ev The event loop
+ * @param kev The kqueue event; udata is the signal watcher
+ * @return EV_OK, or EV_ERROR if the event's signal does not match the watcher
+ */
+static int
+__kqueue_signal_handler(struct ev_loop* ev, struct kevent* kev)
+{
+   struct ev_signal* w = (struct ev_signal*)kev->udata;
+
+   if (w->signum == (int)kev->ident)
+   {
+      w->cb(ev, w, 0);
+      return EV_OK;
+   }
+   else
+   {
+      pgagroal_log_error("No handler found for signal %d", (int)kev->ident);
+      return EV_ERROR;
+   }
+}
+
+/**
+ * Store the interval of a periodic watcher.
+ * @param w The periodic watcher
+ * @param msec The interval in milliseconds
+ * @return EV_OK
+ */
+static int
+__kqueue_periodic_init(struct ev_periodic* w, int msec)
+{
+   w->interval = msec;
+   return EV_OK;
+}
+
+/**
+ * Register a periodic watcher as a kqueue timer (EVFILT_TIMER).
+ *
+ * The interval is handed over in milliseconds, the default unit of
+ * EVFILT_TIMER when no NOTE_* unit flag is given. The previous conversion to
+ * microseconds (w->interval * 1000) was an int multiplication that overflows
+ * for intervals above roughly 35 minutes.
+ *
+ * @param ev The event loop
+ * @param w The periodic watcher
+ * @return EV_OK on success, EV_ERROR if kevent() fails
+ */
+static int
+__kqueue_periodic_start(struct ev_loop* ev, struct ev_periodic* w)
+{
+   struct kevent kev;
+   EV_SET(&kev, (uintptr_t)w, EVFILT_TIMER, EV_ADD | EV_ENABLE, 0, w->interval, w);
+   if (kevent(ev->kqueuefd, &kev, 1, NULL, 0, NULL) == -1)
+   {
+      pgagroal_log_error("kevent: timer add");
+      return EV_ERROR;
+   }
+   return EV_OK;
+}
+
+/**
+ * Remove the kqueue timer of a periodic watcher.
+ * @param ev The event loop
+ * @param w The periodic watcher
+ * @return EV_OK on success, EV_ERROR if kevent() fails
+ */
+static int
+__kqueue_periodic_stop(struct ev_loop* ev, struct ev_periodic* w)
+{
+   struct kevent kev;
+   EV_SET(&kev, (uintptr_t)w, EVFILT_TIMER, EV_DELETE, 0, 0, NULL);
+   if (kevent(ev->kqueuefd, &kev, 1, NULL, 0, NULL) == -1)
+   {
+      pgagroal_log_error("kevent: timer delete");
+      return EV_ERROR;
+   }
+
+   return EV_OK;
+}
+
+static int
+__kqueue_periodic_handler(struct ev_loop* ev, struct kevent* kev)
+{
+   struct
ev_periodic* w = (struct ev_periodic*)kev->udata; + w->cb(ev, w, 0); + return EV_OK; +} + +static int +__kqueue_io_start(struct ev_loop* ev, struct ev_io* w) +{ + struct kevent kev; + int filter; + + switch (w->type) + { + case EV_ACCEPT: + case EV_RECEIVE: + filter = EVFILT_READ; + break; + case EV_SEND: + filter = EVFILT_WRITE; + break; + default: + pgagroal_log_fatal("%s: unknown event type: %d\n", __func__, w->type); + return EV_ERROR; + } + + if (set_non_blocking(w->fd)) + { + pgagroal_log_fatal("%s: set_non_blocking", __func__); + return EV_ERROR; + } + + EV_SET(&kev, w->fd, filter, EV_ADD | EV_ENABLE | EV_CLEAR, 0, 0, w); + + if (kevent(ev->kqueuefd, &kev, 1, NULL, 0, NULL) == -1) + { + pgagroal_log_error("%s: kevent add failed", __func__); + return EV_ERROR; + } + + return EV_OK; +} + +static int +__kqueue_io_stop(struct ev_loop* ev, struct ev_io* w) +{ + struct kevent kev; + int filter; + + switch (w->type) + { + case EV_ACCEPT: + case EV_RECEIVE: + filter = EVFILT_READ; + break; + case EV_SEND: + filter = EVFILT_WRITE; + break; + default: + pgagroal_log_fatal("%s: unknown event type: %d\n", __func__, w->type); + return EV_ERROR; + } + + EV_SET(&kev, w->fd, filter, EV_DELETE, 0, 0, NULL); + + if (kevent(ev->kqueuefd, &kev, 1, NULL, 0, NULL) == -1) + { + pgagroal_log_error("%s: kevent delete failed", __func__); + return EV_ERROR; + } + + return EV_OK; +} + +static int +__kqueue_io_handler(struct ev_loop* ev, struct kevent* kev) +{ + struct ev_io* w = (struct ev_io*)kev->udata; + int ret = EV_OK; + + switch (w->type) + { + case EV_ACCEPT: + ret = __kqueue_accept_handler(ev, w); + break; + case EV_SEND: + ret = __kqueue_send_handler(ev, w); + break; + case EV_RECEIVE: + ret = __kqueue_receive_handler(ev, w); + break; + default: + pgagroal_log_fatal("%s: unknown value for event type %d\n", __func__, w->type); + ret = EV_ERROR; + break; + } + + return ret; +} + +static int +__kqueue_receive_handler(struct ev_loop* ev, struct ev_io* w) +{ + int ret = EV_OK; + 
ssize_t nrecv = 0; + size_t total_recv = 0; + void* buf = ev->buffer; + + if (!buf) + { + pgagroal_log_error("malloc error"); + return EV_ALLOC_ERROR; + } + + if (!w->ssl) + { + while (1) + { + nrecv = recv(w->fd, buf + total_recv, MAX_BUFFER_SIZE - total_recv, 0); + if (nrecv == -1) + { + if (errno == EAGAIN || errno == EWOULDBLOCK) + { + break; + } + else + { + pgagroal_log_error("receive_handler: recv"); + ret = EV_ERROR; + break; + } + } + else if (nrecv == 0) + { + ret = EV_CLOSE_FD; + // pgagroal_log_trace("Connection closed fd=%d client_fd=%d\n", w->fd, w->fd_out); + break; + } + + total_recv += nrecv; + + if (total_recv >= MAX_BUFFER_SIZE) + { + pgagroal_log_error("receive_handler: buffer overflow"); + ret = EV_ERROR; + break; + } + } + + w->data = buf; + w->size = total_recv; + } + + w->cb(ev, w, ret); + return ret; +} + +static int +__kqueue_accept_handler(struct ev_loop* ev, struct ev_io* w) +{ + int ret = EV_OK; + int listen_fd = w->fd; + + while (1) + { + w->fd_out = accept(listen_fd, NULL, NULL); + if (w->fd_out == -1) + { + if (errno == EAGAIN || errno == EWOULDBLOCK) + { + ret = EV_OK; + break; + } + else + { + pgagroal_log_error("accept_handler: accept"); + ret = EV_ERROR; + break; + } + } + else + { + w->cb(ev, w, ret); + } + } + + return ret; +} + +static int +__kqueue_send_handler(struct ev_loop* ev, struct ev_io* w) +{ + int ret = EV_OK; + /* TODO: remove since unused */ + return ret; +} + +#endif diff --git a/src/include/configuration.h b/src/include/configuration.h index 9717eb48..11feb780 100644 --- a/src/include/configuration.h +++ b/src/include/configuration.h @@ -111,7 +111,7 @@ extern "C" { #define CONFIGURATION_ARGUMENT_TLS_CERT_FILE "tls_cert_file" #define CONFIGURATION_ARGUMENT_TLS_KEY_FILE "tls_key_file" #define CONFIGURATION_ARGUMENT_TLS_CA_FILE "tls_ca_file" -#define CONFIGURATION_ARGUMENT_LIBEV "libev" +#define CONFIGURATION_ARGUMENT_EV_BACKEND "ev_backend" #define CONFIGURATION_ARGUMENT_KEEP_ALIVE "keep_alive" #define 
CONFIGURATION_ARGUMENT_NODELAY "nodelay" #define CONFIGURATION_ARGUMENT_NON_BLOCKING "non_blocking" diff --git a/src/include/ev.h b/src/include/ev.h new file mode 100644 index 00000000..4805e957 --- /dev/null +++ b/src/include/ev.h @@ -0,0 +1,413 @@ +/* + * Copyright (C) 2024 The pgagroal community + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors may + * be used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR + * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef EV_H +#define EV_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* pgagroal */ +#include + +/* system */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if HAVE_LINUX +#include +#include +#include +#endif /* HAVE_LINUX */ + +#define ALIGNMENT sysconf(_SC_PAGESIZE) + +#define BUFFER_COUNT 2 +#define MAX_EVENTS 128 + +/** + * Constants used to define the supported + * event backends. + */ + +typedef enum ev_backend +{ + EV_BACKEND_AUTO = 0, + EV_BACKEND_IO_URING, + EV_BACKEND_EPOLL, + EV_BACKEND_KQUEUE, +} ev_backend_t; + +#if HAVE_LINUX +#define DEFAULT_EV_BACKEND EV_BACKEND_IO_URING +#else +#define DEFAULT_EV_BACKEND EV_BACKEND_KQUEUE +#endif + +enum ev_type { + EV_INVALID = 0, + EV_ACCEPT, + EV_RECEIVE, + EV_SEND, + EV_SIGNAL, + EV_PERIODIC, +}; + +enum ev_return_codes { + EV_OK = 0, + EV_ERROR, + EV_CONNECTION_CLOSED, + EV_REPLENISH_BUFFERS, + EV_REARMED, + EV_ALLOC_ERROR, +}; + +/** + * @union sockaddr_u + * @brief Socket address union for IPv4 and IPv6 + * + * Stores either an IPv4 or IPv6 socket address + */ +union sockaddr_u +{ + struct sockaddr_in addr4; /**< IPv4 socket address structure. */ + struct sockaddr_in6 addr6; /**< IPv6 socket address structure. */ +}; + +struct ev_loop; + +/** + * @struct ev_io + * @brief I/O watcher for the event loop + * + * Monitors file descriptors for I/O readiness events (e.g., read or write) + */ +typedef struct ev_io +{ + enum ev_type type; /**< Event type. */ + int fd; /**< File descriptor to watch. */ + int client_fd; /**< Client's file descriptor, if applicable. */ + void* data; /**< Pointer to received data. */ + int size; /**< Size of the data buffer. */ + bool ssl; /**< Indicates if SSL/TLS is used on this connection. */ + struct ev_io* next; /**< Pointer to the next watcher in the linked list. */ + void (*cb)(struct ev_loop*, struct ev_io* watcher, int err); /**< Event callback. 
*/ +} ev_io; + +/** + * @struct ev_signal + * @brief Signal watcher for the event loop + * + * Monitors and handles specific signals received by the process + */ +typedef struct ev_signal +{ + enum ev_type type; /**< Event type. */ + int signum; /**< Signal number to watch for. */ + struct ev_signal* next; /**< Pointer to the next signal watcher. */ + void (*cb)(struct ev_loop*, struct ev_signal* watcher, int err); /**< Event callback. */ +} ev_signal; + +/** + * @struct ev_periodic + * @brief Periodic timer watcher for the event loop + * + * Triggers callbacks at regular intervals specified in milliseconds + */ +typedef struct ev_periodic +{ + enum ev_type type; /**< Event type. */ +#if HAVE_LINUX + struct __kernel_timespec ts; /**< Timespec struct for io_uring loop. */ + int fd; /**< File descriptor for epoll-based periodic watcher. */ +#else + int interval; /**< Interval for kqueue timer. */ +#endif /* HAVE_LINUX */ + struct ev_periodic* next; /**< Pointer to the next periodic watcher. */ + void (*cb)(struct ev_loop*, struct ev_periodic* watcher, int err); /**< Event callback. */ +} ev_periodic; + +/** + * @union ev_watcher + * @brief General watcher union for the event loop + */ +typedef union ev_watcher +{ + struct ev_io* io; /**< Pointer to an I/O watcher. */ + struct ev_signal* signal; /**< Pointer to a signal watcher. */ + struct ev_periodic* periodic; /**< Pointer to a periodic watcher. */ +} ev_watcher; + +#if HAVE_LINUX +/** + * @struct io_buf_ring + * @brief Represents a buffer ring for I/O operations with io_uring. + * + * The io_buf_ring structure holds pointers to an io_uring buffer ring and + * a generic buffer, along with a buffer group ID (bgid). + */ +struct io_buf_ring +{ + struct io_uring_buf_ring* br; /**< Pointer to the io_uring buffer ring internal structure. */ + void* buf; /**< Pointer to the buffer used for I/O operations. 
*/ +}; +#endif /* HAVE_LINUX */ + +/** + * @struct ev_ops + * @brief Event loop backend operations + * + * Contains function pointers for initializing and controlling the event loop, + * allowing for different backend implementations. + */ +struct ev_ops +{ + int (*init)(struct ev_loop* loop); /**< Initializes the event loop backend. */ + int (*loop)(struct ev_loop* loop); /**< Runs the event loop, processing events. */ + int (*io_start)(struct ev_loop* loop, struct ev_io* watcher); /**< Starts an I/O watcher in the event loop. */ + int (*io_stop)(struct ev_loop* loop, struct ev_io* watcher); /**< Stops an I/O watcher in the event loop. */ + int (*signal_init)(struct ev_loop* loop, struct ev_signal* watcher); /**< Initializes a signal watcher. */ + int (*signal_start)(struct ev_loop* loop, struct ev_signal* watcher); /**< Starts a signal watcher in the event loop. */ + int (*signal_stop)(struct ev_loop* loop, struct ev_signal* watcher); /**< Stops a signal watcher in the event loop. */ + int (*periodic_init)(struct ev_loop* loop, struct ev_periodic* watcher); /**< Initializes a periodic watcher. */ + int (*periodic_start)(struct ev_loop* loop, struct ev_periodic* watcher); /**< Starts a periodic watcher in the event loop. */ + int (*periodic_stop)(struct ev_loop* loop, struct ev_periodic* watcher); /**< Stops a periodic watcher in the event loop. */ +}; + +/** + * @struct ev_loop + * @brief Main event loop structure. + * + * Manages the event loop, including I/O, signal, and periodic watchers. + * It handles the execution and coordination of events using the specified backend. + */ +struct ev_loop +{ + volatile bool running; /**< Flag indicating if the event loop is running. */ + atomic_bool atomic_running; /**< Atomic flag for thread-safe running state. */ + struct ev_io ihead; /**< Head of the I/O watchers linked list. */ + struct ev_signal shead; /**< Head of the signal watchers linked list. 
*/
+   struct ev_periodic phead;   /**< Head of the periodic watchers linked list. */
+   sigset_t sigset;            /**< Signal set used for handling signals in the event loop. */
+   struct ev_ops ops;          /**< Backend operations for the event loop. */
+#if HAVE_LINUX
+   struct io_uring_cqe* cqe;
+   struct io_uring ring;
+   struct io_buf_ring br;
+   int bid; /**< io_uring: Next buffer id. */
+   /**
+    * TODO: Implement iovecs.
+    * int iovecs_nr;
+    * struct iovec *iovecs;
+    */
+   int epollfd; /**< File descriptor for the epoll instance (used with epoll backend). */
+#else
+   int kqueuefd; /**< File descriptor for the kqueue instance (used with kqueue backend). */
+#endif /* HAVE_LINUX */
+   void* buffer; /**< Pointer to a buffer used to read in bytes. */
+
+};
+
+typedef void (*io_cb)(struct ev_loop*, struct ev_io* watcher, int err);
+typedef void (*signal_cb)(struct ev_loop*, struct ev_signal* watcher, int err);
+typedef void (*periodic_cb)(struct ev_loop*, struct ev_periodic* watcher, int err);
+
+/**
+ * Initialize a new event loop
+ *
+ * NOTE(review): this prototype takes no arguments, but the io_uring fork path
+ * in ev.c calls pgagroal_ev_init(tmp->config) - confirm the intended signature.
+ *
+ * @return Pointer to the initialized event loop
+ */
+struct ev_loop*
+pgagroal_ev_init(void);
+
+/**
+ * Start the main event loop
+ * @param loop Pointer to the event loop struct
+ * @return Return code
+ */
+int
+pgagroal_ev_loop(struct ev_loop* loop);
+
+/**
+ * Break the event loop, stopping its execution
+ * @param loop Pointer to the event loop struct
+ */
+void
+pgagroal_ev_loop_break(struct ev_loop* loop);
+
+/**
+ * Destroy the event loop, freeing only the strictly necessary resources that
+ * need to be freed.
+ *
+ * @param loop Pointer to the event loop struct
+ * @return Return code
+ */
+int
+pgagroal_ev_loop_destroy(struct ev_loop* loop);
+
+/**
+ * Closes the file descriptors used by the loop of the parent process.
+ * + * @param loop Pointer to the loop that should be freed by the child process + * @return Return code + */ +int +pgagroal_ev_fork(struct ev_loop* loop); + +/** + * Check if the event loop is currently running + * @param loop Pointer to the event loop struct + * @return True if the loop is running, false otherwise + */ +bool +pgagroal_ev_loop_is_running(struct ev_loop* loop); + +/** + * Atomically check if the event loop is running + * @param loop Pointer to the event loop struct + * @return True if the loop is running, false otherwise + */ +bool +pgagroal_ev_atomic_loop_is_running(struct ev_loop* loop); + +/** + * Initialize the watcher for accept event + * @param w Pointer to the io event watcher struct + * @param fd File descriptor being watched + * @param cb Callback executed when event completes + * @return Return code + */ +int +pgagroal_ev_io_accept_init(struct ev_io* w, int fd, io_cb cb); + +/** + * Initialize the watcher for receive events + * @param w Pointer to the io event watcher struct + * @param fd File descriptor being watched + * @param cb Callback executed when event completes + * @return Return code + */ +int +pgagroal_ev_io_receive_init(struct ev_io* w, int fd, io_cb cb); + +/** + * Initialize the watcher for sending IO operations + * @param w Pointer to the io event watcher struct + * @param fd File descriptor being watched + * @param cb Callback executed when event completes + * @param buf Pointer to the buffer to be sent + * @param buf_len Length of the buffer to be sent + * @return Return code + */ +int +pgagroal_ev_io_send_init(struct ev_io* w, int fd, io_cb cb, void* buf, int buf_len); + +/** + * Start the watcher for an IO event in the event loop + * @param loop Pointer to the event loop struct + * @param w Pointer to the io event watcher struct + * @return Return code + */ +int +pgagroal_ev_io_start(struct ev_loop* loop, struct ev_io* w); + +/** + * Stop the watcher for an IO event in the event loop + * @param loop Pointer to the 
event loop struct + * @param w Pointer to the io event watcher struct + * @return Return code + */ +int +pgagroal_ev_io_stop(struct ev_loop* loop, struct ev_io* w); + +/** + * Initialize the watcher for periodic timeout events + * @param w Pointer to the periodic event watcher struct + * @param cb Callback executed on timeout + * @param msec Interval in milliseconds for the periodic event + * @return Return code + */ +int +pgagroal_ev_periodic_init(struct ev_periodic* w, periodic_cb cb, int msec); + +/** + * Start the watcher for a periodic timeout in the event loop + * @param loop Pointer to the event loop struct + * @param w Pointer to the periodic event watcher struct + * @return Return code + */ +int +pgagroal_ev_periodic_start(struct ev_loop* loop, struct ev_periodic* w); + +/** + * Stop the watcher for a periodic timeout in the event loop + * @param loop Pointer to the event loop struct + * @param w Pointer to the periodic event watcher struct + * @return Return code + */ +int +pgagroal_ev_periodic_stop(struct ev_loop* loop, struct ev_periodic* w); + +/** + * Initialize the watcher for signal events + * @param w Pointer to the signal event watcher struct + * @param cb Callback executed when signal is received + * @param signum Signal number to watch + * @return Return code + */ +int +pgagroal_ev_signal_init(struct ev_signal* w, signal_cb cb, int signum); + +/** + * Start the watcher for a signal in the event loop + * @param loop Pointer to the event loop struct + * @param w Pointer to the signal event watcher struct + * @return Return code + */ +int +pgagroal_ev_signal_start(struct ev_loop* loop, struct ev_signal* w); + +/** + * Stop the watcher for a signal in the event loop + * @param loop Pointer to the event loop struct + * @param w Pointer to the signal event watcher struct + * @return Return code + */ +int +pgagroal_ev_signal_stop(struct ev_loop* loop, struct ev_signal* w); + +#endif /* EV_H */ diff --git a/src/include/message.h b/src/include/message.h 
index 71dc027b..3d55e346 100644 --- a/src/include/message.h +++ b/src/include/message.h @@ -394,6 +394,15 @@ pgagroal_log_message(struct message* msg); int pgagroal_read_socket_message(int socket, struct message** msg); +/** + * Read a message from a buffer + * @param data The buffer to "copy" from (NOTE(review): size is presumably its length in bytes - confirm) + * @param msg The resulting message + * @return One of MESSAGE_STATUS_ZERO, MESSAGE_STATUS_OK or MESSAGE_STATUS_ERROR + */ +int +pgagroal_buffer_to_message(void* data, ssize_t size, struct message** msg); + /** * Write a message using a socket * @param socket The socket descriptor diff --git a/src/include/network.h b/src/include/network.h index 5fe0e4d3..ebed080e 100644 --- a/src/include/network.h +++ b/src/include/network.h @@ -33,6 +33,7 @@ extern "C" { #endif +#include #include #include diff --git a/src/include/pgagroal.h b/src/include/pgagroal.h index 4df183b9..fc853373 100644 --- a/src/include/pgagroal.h +++ b/src/include/pgagroal.h @@ -33,7 +33,6 @@ extern "C" { #endif -#include #include #include #include @@ -555,7 +554,7 @@ struct main_configuration bool disconnect_client_force; /**< Force a disconnect client if active for more than the specified seconds */ char pidfile[MAX_PATH]; /**< File containing the PID */ - char libev[MISC_LENGTH]; /**< Name of libev mode */ + int ev_backend; /**< Selected ev backend */ bool keep_alive; /**< Use keep alive */ bool nodelay; /**< Use NODELAY */ bool non_blocking; /**< Use non blocking */ diff --git a/src/include/pipeline.h b/src/include/pipeline.h index 589c2e42..178f53e6 100644 --- a/src/include/pipeline.h +++ b/src/include/pipeline.h @@ -33,9 +33,9 @@ extern "C" { #endif +#include #include -#include #include #define PIPELINE_AUTO -1 diff --git a/src/include/utils.h b/src/include/utils.h index 6b26a245..0bf4235d 100644 --- a/src/include/utils.h +++ b/src/include/utils.h @@ -34,6 +34,7 @@ extern "C" { #endif #include +#include #include #include @@ -43,7 +44,7 @@ extern "C" { */ struct signal_info { - struct ev_signal
signal; /**< The libev base type */ + struct ev_signal signal; /**< The ev backend base type */ int slot; /**< The slot */ }; @@ -275,28 +276,6 @@ pgagroal_bigendian(void); unsigned int pgagroal_swap(unsigned int i); -/** - * Print the available libev engines - */ -void -pgagroal_libev_engines(void); - -/** - * Get the constant for a libev engine - * @param engine The name of the engine - * @return The constant - */ -unsigned int -pgagroal_libev(char* engine); - -/** - * Get the name for a libev engine - * @param val The constant - * @return The name - */ -char* -pgagroal_libev_engine(unsigned int val); - /** * Get the home directory * @return The directory diff --git a/src/include/worker.h b/src/include/worker.h index 5478293e..62b07f7f 100644 --- a/src/include/worker.h +++ b/src/include/worker.h @@ -51,7 +51,7 @@ extern "C" { */ struct worker_io { - struct ev_io io; /**< The libev base type */ + struct ev_io io; /**< The ev backend base type */ int client_fd; /**< The client descriptor */ int server_fd; /**< The server descriptor */ int slot; /**< The slot */ diff --git a/src/libpgagroal/configuration.c b/src/libpgagroal/configuration.c index c8d68e2a..2109dd13 100644 --- a/src/libpgagroal/configuration.c +++ b/src/libpgagroal/configuration.c @@ -115,6 +115,10 @@ static int to_log_type(char* where, int value); static void add_configuration_response(struct json* res); static void add_servers_configuration_response(struct json* res); +static bool is_supported_backend(ev_backend_t backend); +static char* to_backend_str(ev_backend_t value); +static ev_backend_t to_backend_type(char* str); + /** * */ @@ -159,6 +163,8 @@ pgagroal_init_configuration(void* shm) config->tracker = false; config->track_prepared_statements = false; + config->ev_backend = EV_BACKEND_AUTO; + config->common.log_type = PGAGROAL_LOGGING_TYPE_CONSOLE; config->common.log_level = PGAGROAL_LOGGING_LEVEL_INFO; config->common.log_connections = false; @@ -380,6 +386,8 @@ 
pgagroal_validate_configuration(void* shm, bool has_unix_socket, bool has_main_s bool tls; struct stat st; struct main_configuration* config; + int fd; + char rval; tls = false; @@ -711,6 +719,49 @@ pgagroal_validate_configuration(void* shm, bool has_unix_socket, bool has_main_s } } + if (config->ev_backend == EV_BACKEND_AUTO || !is_supported_backend(config->ev_backend)) + { + config->ev_backend = DEFAULT_EV_BACKEND; + } + + if (config->ev_backend == EV_BACKEND_IO_URING) + { + /* check if io_uring is enabled or works for supported configuration, else fallback to next backend */ + fd = open("/proc/sys/kernel/io_uring_disabled", O_RDONLY); + if (fd < 0) + { + pgagroal_log_debug("Failed to open file /proc/sys/kernel/io_uring_disabled: %s", strerror(errno)); + goto fallback; + } + if (read(fd, &rval, 1) <= 0) + { + pgagroal_log_fatal("Failed to read file /proc/sys/kernel/io_uring_disabled"); + close(fd); /* do not leak the descriptor on the error path */ + return 1; + } + if (close(fd) < 0) + { + pgagroal_log_fatal("Failed to close file descriptor for /proc/sys/kernel/io_uring_disabled: %s", strerror(errno)); + return 1; + } + + /* see doc: https://docs.kernel.org/admin-guide/sysctl/kernel.html#io-uring-disabled */ + if (config->common.tls || (rval == '1') || (rval == '2')) + { + if (config->common.tls) + { + pgagroal_log_warn("io_uring not supported with tls on"); + } + else + { + pgagroal_log_warn("io_uring supported but not enabled.
Enable io_uring by setting /proc/sys/kernel/io_uring_disabled to '0'"); + } +fallback: + config->ev_backend = EV_BACKEND_EPOLL; + } + } + pgagroal_log_debug("Selected backend '%s'", to_backend_str(config->ev_backend)); + // do some last initialization here, since the configuration // looks good so far pgagroal_init_pidfile_if_needed(); @@ -2681,15 +2731,15 @@ transfer_configuration(struct main_configuration* config, struct main_configurat changed = true; } - /* libev */ - if (restart_string("libev", config->libev, reload->libev, true)) + /* ev backend; NOTE(review): assumes reload was already validated so that 'auto' is resolved - confirm */ + if (restart_int("ev_backend", config->ev_backend, reload->ev_backend)) { changed = true; } config->keep_alive = reload->keep_alive; config->nodelay = reload->nodelay; config->non_blocking = reload->non_blocking; config->backlog = reload->backlog; /* hugepage */ if (restart_int("hugepage", config->common.hugepage, reload->common.hugepage)) { @@ -2835,6 +2891,36 @@ is_same_tls(struct server* src, struct server* dst) } } +/** + * Checks if event backend is supported.
+ * @return true if supported, false otherwise + */ +static bool +is_supported_backend(ev_backend_t backend) +{ + int bi, backends; + ev_backend_t supported_backends[] = { +#if HAVE_LINUX + EV_BACKEND_IO_URING, + EV_BACKEND_EPOLL, +#else + EV_BACKEND_KQUEUE, +#endif + }; + backends = sizeof(supported_backends) / sizeof(supported_backends[0]); + + for (bi = 0; bi < backends; bi++) + { + if (backend == supported_backends[bi]) + { + return true; + } + } + + pgagroal_log_warn("Configured backend is unsupported"); + return false; +} + static void copy_server(struct server* dst, struct server* src) { @@ -4315,6 +4401,76 @@ to_log_type(char* where, int value) return 0; } +/** + * Convert a string description into ev_backend_t. + * + * @param str The string representing the event backend + * @return The corresponding ev_backend_t value for the given string. If the input + * string is not recognized, EV_BACKEND_AUTO is returned + */ +static ev_backend_t +to_backend_type(char* str) +{ + if (is_empty_string(str)) + { + pgagroal_log_warn("ev_backend configuration is empty. Default to 'auto'"); + return EV_BACKEND_AUTO; + } + if (!strncmp(str, "auto", MISC_LENGTH)) + { + pgagroal_log_debug("Configured event backend 'auto'"); + return EV_BACKEND_AUTO; + } + if (!strncmp(str, "io_uring", MISC_LENGTH)) + { + pgagroal_log_debug("Configured event backend 'io_uring'"); + return EV_BACKEND_IO_URING; + } + if (!strncmp(str, "epoll", MISC_LENGTH)) + { + pgagroal_log_debug("Configured event backend 'epoll'"); + return EV_BACKEND_EPOLL; + } + if (!strncmp(str, "kqueue", MISC_LENGTH)) + { + pgagroal_log_debug("Configured event backend 'kqueue'"); + return EV_BACKEND_KQUEUE; + } + + pgagroal_log_warn("Configured event backend '%s' not supported. Default to 'auto'", str); + return EV_BACKEND_AUTO; +} + +/** + * Convert ev_backend_t to its string description. + * + * @param value The ev_backend_t enum value + * @return A string describing the ev_backend_t value. 
If the value is invalid + * or not recognized, the function returns "auto" + */ +static char* +to_backend_str(ev_backend_t value) +{ + if (value < 0) + { + return "auto"; + } + + switch (value) + { + case EV_BACKEND_AUTO: + return "auto"; + case EV_BACKEND_IO_URING: + return "io_uring"; + case EV_BACKEND_EPOLL: + return "epoll"; + case EV_BACKEND_KQUEUE: + return "kqueue"; + } + + return "auto"; +} + int pgagroal_apply_main_configuration(struct main_configuration* config, struct server* srv, @@ -4675,15 +4831,9 @@ pgagroal_apply_main_configuration(struct main_configuration* config, } memcpy(config->unix_socket_dir, value, max); } - else if (key_in_section("libev", section, key, true, &unknown)) + else if (key_in_section("ev_backend", section, key, true, &unknown)) { - - max = strlen(value); - if (max > MISC_LENGTH - 1) - { - max = MISC_LENGTH - 1; - } - memcpy(config->libev, value, max); + config->ev_backend = to_backend_type(value); } else if (key_in_section("keep_alive", section, key, true, &unknown)) { @@ -5351,7 +5501,7 @@ add_configuration_response(struct json* res) pgagroal_json_put(res, CONFIGURATION_ARGUMENT_TLS_CERT_FILE, (uintptr_t)config->common.tls_cert_file, ValueString); pgagroal_json_put(res, CONFIGURATION_ARGUMENT_TLS_KEY_FILE, (uintptr_t)config->common.tls_key_file, ValueString); pgagroal_json_put(res, CONFIGURATION_ARGUMENT_TLS_CA_FILE, (uintptr_t)config->common.tls_ca_file, ValueString); - pgagroal_json_put(res, CONFIGURATION_ARGUMENT_LIBEV, (uintptr_t)config->libev, ValueString); + pgagroal_json_put(res, CONFIGURATION_ARGUMENT_EV_BACKEND, (uintptr_t)to_backend_str(config->ev_backend), ValueString); pgagroal_json_put(res, CONFIGURATION_ARGUMENT_KEEP_ALIVE, (uintptr_t)config->keep_alive, ValueBool); pgagroal_json_put(res, CONFIGURATION_ARGUMENT_NODELAY, (uintptr_t)config->nodelay, ValueBool); pgagroal_json_put(res, CONFIGURATION_ARGUMENT_NON_BLOCKING, (uintptr_t)config->non_blocking, ValueBool); @@ -5379,14 +5529,14 @@ 
add_servers_configuration_response(struct json* res) { return; } - + pgagroal_json_put(server_conf, CONFIGURATION_ARGUMENT_HOST, (uintptr_t)config->servers[i].host, ValueString); pgagroal_json_put(server_conf, CONFIGURATION_ARGUMENT_PORT, (uintptr_t)config->servers[i].port, ValueInt64); pgagroal_json_put(server_conf, CONFIGURATION_ARGUMENT_TLS, (uintptr_t)config->servers[i].tls, ValueBool); pgagroal_json_put(server_conf, CONFIGURATION_ARGUMENT_TLS_CERT_FILE, (uintptr_t)config->servers[i].tls_cert_file, ValueString); pgagroal_json_put(server_conf, CONFIGURATION_ARGUMENT_TLS_KEY_FILE, (uintptr_t)config->servers[i].tls_key_file, ValueString); pgagroal_json_put(server_conf, CONFIGURATION_ARGUMENT_TLS_CA_FILE, (uintptr_t)config->servers[i].tls_ca_file, ValueString); - + pgagroal_json_put(res, config->servers[i].name, (uintptr_t)server_conf, ValueJSON); } } @@ -5672,7 +5822,7 @@ pgagroal_conf_set(SSL* ssl, int client_fd, uint8_t compression, uint8_t encrypti memcpy(config->common.tls_key_file, config_value, max); config->common.tls_key_file[max] = '\0'; pgagroal_json_put(response, key, (uintptr_t)config->common.tls_key_file, ValueString); - } + } } else if (!strcmp(key, "tls_ca_file")) { @@ -5996,16 +6146,15 @@ pgagroal_conf_set(SSL* ssl, int client_fd, uint8_t compression, uint8_t encrypti config->failover_script[max] = '\0'; pgagroal_json_put(response, key, (uintptr_t)config->failover_script, ValueString); } - else if (!strcmp(key, "libev")) + else if (!strcmp(key, "ev_backend")) { max = strlen(config_value); if (max > MISC_LENGTH - 1) { max = MISC_LENGTH - 1; } - memcpy(config->libev, config_value, max); - config->libev[max] = '\0'; - pgagroal_json_put(response, key, (uintptr_t)config->libev, ValueString); + config->ev_backend = to_backend_type(config_value); + pgagroal_json_put(response, key, (uintptr_t)to_backend_str(config->ev_backend), ValueString); } else if (!strcmp(key, "update_process_title")) { @@ -6033,8 +6182,7 @@ pgagroal_conf_set(SSL* ssl, int client_fd, 
uint8_t compression, uint8_t encrypti if (pgagroal_management_response_ok(NULL, client_fd, start_time, end_time, compression, encryption, payload)) { pgagroal_management_response_error(NULL, client_fd, NULL, MANAGEMENT_ERROR_CONF_SET_NETWORK, compression, encryption, payload); - pgagroal_log_error("Conf Set: Error sending response"); - goto error; + pgagroal_log_error("Conf Set: Error sending response"); goto error; } elapsed = pgagroal_get_timestamp_string(start_time, end_time, &total_seconds); @@ -6058,4 +6206,4 @@ pgagroal_conf_set(SSL* ssl, int client_fd, uint8_t compression, uint8_t encrypti exit(1); -} \ No newline at end of file +} diff --git a/src/libpgagroal/ev.c b/src/libpgagroal/ev.c new file mode 100644 index 00000000..1c6f7740 --- /dev/null +++ b/src/libpgagroal/ev.c @@ -0,0 +1,1725 @@ +/* + * Copyright (C) 2024 The pgagroal community + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors may + * be used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL + * THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR + * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* pgagroal */ +#include +#include +#include +#include +#include + +/* system */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if HAVE_LINUX +#include +#include +#include +#include +#include +#else +#include +#include +#include +#endif /* HAVE_LINUX */ + +#define TYPEOF(watcher) watcher->io->type + +#define for_each(w, first) for (w = first; w; w = w->next) + +#define list_add(w, first) \ + do { \ + w->next = first; \ + first = w; \ + } while (0) \ + +#define list_delete(w, first, target, ret) \ + do { \ + for (w = first; *w && *w != target; w = &(*w)->next); \ + if (!(*w)) { \ + pgagroal_log_warn("%s: target watcher not found", __func__); \ + ret = EV_ERROR; \ + } else { \ + if (!target->next) { \ + *w = NULL; \ + } else { \ + *w = target->next; \ + } \ + } \ + } while (0) \ + +static int (*loop_init)(struct ev_loop*); +static int (*loop_start)(struct ev_loop*); +static int (*loop_fork)(struct ev_loop*); +static int (*loop_destroy)(struct ev_loop*); +static void (*loop_break)(struct ev_loop*); + +static int io_init(struct ev_io*, int, int, io_cb, void*, int, int); +static int (*io_start)(struct ev_loop*, struct ev_io*); +static int (*io_stop)(struct ev_loop*, struct ev_io*); + +static int (*signal_start)(struct ev_loop*, struct ev_signal*); +static int (*signal_stop)(struct ev_loop*, struct ev_signal*); + +static int (*periodic_init)(struct 
ev_periodic*, int); +static int (*periodic_start)(struct ev_loop*, struct ev_periodic*); +static int (*periodic_stop)(struct ev_loop*, struct ev_periodic*); + +static bool (*is_running)(struct ev_loop*); +static void (*set_running)(struct ev_loop*); + +#if HAVE_LINUX +static int __io_uring_init(struct ev_loop*); +static int __io_uring_destroy(struct ev_loop*); +static int __io_uring_handler(struct ev_loop*, struct io_uring_cqe*); +static int __io_uring_loop(struct ev_loop*); +static int __io_uring_fork(struct ev_loop*); +static int __io_uring_io_start(struct ev_loop*, struct ev_io*); +static int __io_uring_io_stop(struct ev_loop*, struct ev_io*); +static int __io_uring_setup_buffers(struct ev_loop*); +static int __io_uring_setup_more_buffers(struct ev_loop* loop); +static int __io_uring_periodic_init(struct ev_periodic*, int); +static int __io_uring_periodic_start(struct ev_loop*, struct ev_periodic*); +static int __io_uring_periodic_stop(struct ev_loop*, struct ev_periodic*); +static int __io_uring_signal_handler(struct ev_loop*, int); +static int __io_uring_signal_start(struct ev_loop*, struct ev_signal*); +static int __io_uring_signal_stop(struct ev_loop*, struct ev_signal*); +static int __io_uring_receive_handler(struct ev_loop*, struct ev_io*, struct io_uring_cqe*, void**, bool); +static int __io_uring_send_handler(struct ev_loop*, struct ev_io*, struct io_uring_cqe*); +static int __io_uring_accept_handler(struct ev_loop*, struct ev_io*, struct io_uring_cqe*); +static int __io_uring_periodic_handler(struct ev_loop*, struct ev_periodic*); + +static int __epoll_init(struct ev_loop*); +static int __epoll_destroy(struct ev_loop*); +static int __epoll_handler(struct ev_loop*, void*); +static int __epoll_loop(struct ev_loop*); +static int __epoll_fork(struct ev_loop*); +static int __epoll_io_start(struct ev_loop*, struct ev_io*); +static int __epoll_io_stop(struct ev_loop*, struct ev_io*); +static int __epoll_io_handler(struct ev_loop*, struct ev_io*); +static int 
__epoll_send_handler(struct ev_loop*, struct ev_io*); +static int __epoll_accept_handler(struct ev_loop*, struct ev_io*); +static int __epoll_receive_handler(struct ev_loop*, struct ev_io*); +static int __epoll_periodic_init(struct ev_periodic*, int); +static int __epoll_periodic_start(struct ev_loop*, struct ev_periodic*); +static int __epoll_periodic_stop(struct ev_loop*, struct ev_periodic*); +static int __epoll_periodic_handler(struct ev_loop*, struct ev_periodic*); +static int __epoll_signal_stop(struct ev_loop*, struct ev_signal*); +static int __epoll_signal_handler(struct ev_loop*); +static int __epoll_signal_start(struct ev_loop*, struct ev_signal*); +#else +static int __kqueue_init(struct ev_loop*); +static int __kqueue_destroy(struct ev_loop*); +static int __kqueue_handler(struct ev_loop*, struct kevent*); +static int __kqueue_loop(struct ev_loop*); +static int __kqueue_fork(struct ev_loop*); +static int __kqueue_io_start(struct ev_loop*, struct ev_io*); +static int __kqueue_io_stop(struct ev_loop*, struct ev_io*); +static int __kqueue_io_handler(struct ev_loop*, struct kevent*); +static int __kqueue_send_handler(struct ev_loop*, struct ev_io*); +static int __kqueue_accept_handler(struct ev_loop*, struct ev_io*); +static int __kqueue_receive_handler(struct ev_loop*, struct ev_io*); +static int __kqueue_periodic_init(struct ev_periodic*, int); +static int __kqueue_periodic_start(struct ev_loop*, struct ev_periodic*); +static int __kqueue_periodic_stop(struct ev_loop*, struct ev_periodic*); +static int __kqueue_periodic_handler(struct ev_loop*, struct kevent*); +static int __kqueue_signal_stop(struct ev_loop*, struct ev_signal*); +static int __kqueue_signal_handler(struct ev_loop*, struct kevent*); +static int __kqueue_signal_start(struct ev_loop*, struct ev_signal*); +#endif /* HAVE_LINUX */ + +/* context globals */ + +static bool multithreading = false; /* Enable multithreading for a loop */ + +#ifdef HAVE_LINUX +static struct io_uring_params params; /* 
io_uring argument params */ +static int entries; /* io_uring entries flag */ +static bool use_huge; /* io_uring use_huge flag */ +static int buf_size; /* Size of the ring-mapped buffers */ +static int buf_count; /* Number of ring-mapped buffers */ +static int br_mask; /* Buffer ring mask value */ + +static int epoll_flags; /* Flags for epoll instance creation */ +#else +static int kqueue_flags; /* Flags for kqueue instance creation */ +#endif /* HAVE_LINUX */ + +static inline bool +_is_running(struct ev_loop* loop) +{ + return loop->running; +} + +static inline bool +_is_running_atomic(struct ev_loop* loop) +{ + return atomic_load(&loop->atomic_running); +} + +static inline void +_set_running(struct ev_loop* loop) +{ + loop->running = true; +} +static inline void +_set_running_atomic(struct ev_loop* loop) +{ + atomic_store(&loop->atomic_running, true); +} + +static inline void +_break(struct ev_loop* loop) +{ + loop->running = false; +} +static inline void +_break_atomic(struct ev_loop* loop) +{ + atomic_store(&loop->atomic_running, false); +} + +static int +setup_ops(struct ev_loop* loop) +{ + struct main_configuration* config = (struct main_configuration*)shmem; + + is_running = multithreading ? _is_running_atomic : _is_running; + set_running = multithreading ? _set_running_atomic: _set_running; + loop_break = multithreading ? 
_break_atomic: _break; + +#if HAVE_LINUX + if (config->ev_backend == EV_BACKEND_IO_URING) + { + loop_init = __io_uring_init; + loop_fork = __io_uring_fork; + loop_destroy = __io_uring_destroy; + loop_start = __io_uring_loop; + io_start = __io_uring_io_start; + io_stop = __io_uring_io_stop; + periodic_init = __io_uring_periodic_init; + periodic_start = __io_uring_periodic_start; + periodic_stop = __io_uring_periodic_stop; + signal_start = __io_uring_signal_start; + signal_stop = __io_uring_signal_stop; + return EV_OK; + } + else if (config->ev_backend == EV_BACKEND_EPOLL) + { + loop_init = __epoll_init; + loop_fork = __epoll_fork; + loop_destroy = __epoll_destroy; + loop_start = __epoll_loop; + io_start = __epoll_io_start; + io_stop = __epoll_io_stop; + periodic_init = __epoll_periodic_init; + periodic_start = __epoll_periodic_start; + periodic_stop = __epoll_periodic_stop; + signal_start = __epoll_signal_start; + signal_stop = __epoll_signal_stop; + return EV_OK; + } +#else + if (config->ev_backend == EV_BACKEND_KQUEUE) + { + loop_init = __kqueue_init; + loop_fork = __kqueue_fork; + loop_destroy = __kqueue_destroy; + loop_start = __kqueue_loop; + io_start = __kqueue_io_start; + io_stop = __kqueue_io_stop; + periodic_init = __kqueue_periodic_init; + periodic_start = __kqueue_periodic_start; + periodic_stop = __kqueue_periodic_stop; + signal_start = __kqueue_signal_start; + signal_stop = __kqueue_signal_stop; + return EV_OK; + } +#endif /* HAVE_LINUX */ + return EV_ERROR; +} + +/* This function is used exclusively by the parent process to handle + * SIGCHLD and avoid defunct processes */ +static void +sigchld_handler(struct ev_loop* loop, struct ev_signal* w, int sig) +{ +#if DEBUG + int status; + pid_t pid; + while ((pid = waitpid(-1, &status, WNOHANG)) > 0) + { + if (WIFEXITED(status)) + { + pgagroal_log_debug("Child %d exited with status %d", pid, WEXITSTATUS(status)); + } + else if (WIFSIGNALED(status)) + { + pgagroal_log_debug("Child %d terminated by signal %d", 
pid, WTERMSIG(status)); + } + else + { + pgagroal_log_debug("Child %d terminated unexpectedly", pid); + } + } + if (pid == -1 && errno != ECHILD) + { + pgagroal_log_error("%s: waitpid: %s", __func__, strerror(errno)); + } +#else + while (waitpid(-1, NULL, WNOHANG) > 0) + ; +#endif +} + +/* + * Allocate and initialize an event loop. The first call also sets up the + * backend context and operations and registers the SIGCHLD watcher. + * Returns NULL if allocation, backend selection or loop_init fails. + */ +struct ev_loop* +pgagroal_ev_init(void) +{ + struct ev_loop* loop; + static ev_signal w = { + .type = EV_SIGNAL, + .signum = SIGCHLD, + .cb = sigchld_handler, + .next = NULL, + }; + + static bool context_is_set = false; + + loop = calloc(1, sizeof(struct ev_loop)); + if (loop == NULL) + { + /* the loop is dereferenced right below, so fail early on out-of-memory */ + pgagroal_log_fatal("pgagroal: Failed to allocate event loop"); + return NULL; + } + sigemptyset(&loop->sigset); + + if (!context_is_set) + { + +#if HAVE_LINUX + /* io_uring context */ + entries = 32; + params.cq_entries = 64; + params.flags = IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_CLAMP | IORING_SETUP_CQSIZE; + buf_count = BUFFER_COUNT; + buf_size = DEFAULT_BUFFER_SIZE; + br_mask = (buf_count - 1); + + /* epoll context */ + epoll_flags = 0; +#else + /* kqueue context */ + kqueue_flags = 0; +#endif /* HAVE_LINUX */ + + if (setup_ops(loop)) + { + pgagroal_log_fatal("pgagroal: Failed to set up event backend operations"); + goto error; + } + + if (loop_init(loop)) + { + pgagroal_log_fatal("pgagroal: Failed to initiate loop"); + goto error; + } + + /* handle with SIGCHLD if the main_loop */ + pgagroal_ev_signal_start(loop, &w); + + context_is_set = true; + } + else if (loop_init(loop)) + { + pgagroal_log_fatal("pgagroal: Failed to initiate loop"); + goto error; + } + + return loop; + +error: + free(loop); + return NULL; +} + +int +pgagroal_ev_loop(struct ev_loop* loop) +{ + return loop_start(loop); +} + +int +pgagroal_ev_fork(struct ev_loop* loop) +{ + if (sigprocmask(SIG_UNBLOCK, &loop->sigset, NULL) == -1) + { + pgagroal_log_fatal("sigprocmask"); + exit(1); + } + /* no need to empty sigset */ + return loop_fork(loop); +} + +int +pgagroal_ev_loop_destroy(struct ev_loop* loop) +{ + int ret; + if (!loop) + { + return EV_OK; + } + ret = loop_destroy(loop); + free(loop); + return ret; +} + +void
+pgagroal_ev_loop_break(struct ev_loop* loop) +{ + loop_break(loop); +} + +bool +pgagroal_ev_loop_is_running(struct ev_loop* loop) +{ + return loop->running; +} + +bool +pgagroal_ev_atomic_loop_is_running(struct ev_loop* loop) +{ + return atomic_load(&loop->atomic_running); +} + +int +pgagroal_ev_io_accept_init(struct ev_io* w, int fd, io_cb cb) +{ + return io_init(w, fd, EV_ACCEPT, cb, NULL, 0, -1); +} + +int +pgagroal_ev_io_send_init(struct ev_io* w, int fd, io_cb cb, void* buf, int buf_len) +{ + return io_init(w, fd, EV_SEND, cb, buf, buf_len, -1); +} + +int +pgagroal_ev_io_receive_init(struct ev_io* w, int fd, io_cb cb) +{ + return io_init(w, fd, EV_RECEIVE, cb, NULL, 0, -1); +} + +int +pgagroal_ev_io_start(struct ev_loop* loop, struct ev_io* w) +{ + list_add(w, loop->ihead.next); + return io_start(loop, w); +} + +int +pgagroal_ev_io_stop(struct ev_loop* loop, struct ev_io* target) +{ + int ret = EV_OK; + struct ev_io** w; + if (!loop) + { + pgagroal_log_debug("loop is NULL"); + return EV_ERROR; + } + if (!target) + { + pgagroal_log_fatal("target is NULL"); + return EV_ERROR; + } + io_stop(loop, target); + list_delete(w, &loop->ihead.next, target, ret); + return ret; +} + +int +pgagroal_ev_signal_init(struct ev_signal* w, signal_cb cb, int signum) +{ + w->type = EV_SIGNAL; + w->signum = signum; + w->cb = cb; + w->next = NULL; + return EV_OK; +} + +int +pgagroal_ev_signal_start(struct ev_loop* loop, struct ev_signal* w) +{ + sigaddset(&loop->sigset, w->signum); + if (sigprocmask(SIG_BLOCK, &loop->sigset, NULL) == -1) + { + pgagroal_log_fatal("sigprocmask"); + exit(1); + } + signal_start(loop, w); + list_add(w, loop->shead.next); + return EV_OK; +} + +int __attribute__ ((unused)) +pgagroal_ev_signal_stop(struct ev_loop* loop, struct ev_signal* target) +{ + int ret = EV_OK; + sigset_t tmp; + struct ev_signal** w; + + if (!target) + { + pgagroal_log_fatal("target is NULL"); + exit(1); + } + + sigemptyset(&tmp); + sigaddset(&tmp, target->signum); +#if !HAVE_LINUX + 
/* TODO FreeBSD catches SIGINT as soon as it is removed from + * sigset. This should be handled in a better way. + * This is left here as a way to "fix" the issue. + */ + if (target->signum != SIGINT) + { +#endif + if (sigprocmask(SIG_UNBLOCK, &tmp, NULL) == -1) + { + pgagroal_log_error("sigprocmask"); + exit(1); + } +#if !HAVE_LINUX +} +#endif + + signal_stop(loop, target); + + list_delete(w, &loop->shead.next, target, ret); + + return ret; +} + +int +pgagroal_ev_periodic_init(struct ev_periodic* w, periodic_cb cb, int msec) +{ + if (periodic_init(w, msec)) + { + pgagroal_log_fatal("periodic_init"); + exit(1); + } + w->type = EV_PERIODIC; + w->cb = cb; + w->next = NULL; + return EV_OK; +} + +int +pgagroal_ev_periodic_start(struct ev_loop* loop, struct ev_periodic* w) +{ + periodic_start(loop, w); + list_add(w, loop->phead.next); + return EV_OK; +} + +int __attribute__((unused)) +pgagroal_ev_periodic_stop(struct ev_loop* loop, struct ev_periodic* target) +{ + int ret; + struct ev_periodic** w; + if (!target) + { + pgagroal_log_error("null pointer provided to stop"); + return EV_ERROR; + } + ret = periodic_stop(loop, target); + list_delete(w, &loop->phead.next, target, ret); + return ret; +} + +static int +io_init(struct ev_io* w, int fd, int event, io_cb cb, void* data, int size, int rsvd) +{ + w->type = event; + w->fd = fd; + w->cb = cb; + w->data = data; + w->size = size; + return EV_OK; +} + +#if HAVE_LINUX +static inline struct io_uring_sqe* +__io_uring_get_sqe(struct ev_loop* loop) +{ + struct io_uring* ring = &loop->ring; + struct io_uring_sqe* sqe; + /* + * Retry until an sqe is available. Without SQPOLL io_uring_sqring_wait() + * returns immediately, so flush the queued sqes first to free a slot. + */ + do + { + sqe = io_uring_get_sqe(ring); + if (sqe) + { + return sqe; + } + else + { + io_uring_submit(ring); + io_uring_sqring_wait(ring); + } + } + while (1); +} + +static inline void __attribute__((unused)) +__io_uring_rearm_receive(struct ev_loop* loop, struct ev_io* w) +{ + struct io_uring_sqe* sqe = __io_uring_get_sqe(loop); + io_uring_sqe_set_data(sqe, w); +
io_uring_prep_recv_multishot(sqe, w->fd, NULL, 0, 0); + sqe->flags |= IOSQE_BUFFER_SELECT; + sqe->buf_group = 0; +} + +static inline int __attribute__((unused)) +__io_uring_replenish_buffers(struct ev_loop* loop, struct io_buf_ring* br, int bid_start, int bid_end) +{ + int count; + int offset = 0; + if (bid_end >= bid_start) + { + count = (bid_end - bid_start); + } + else + { + count = (bid_end + buf_count - bid_start); + } + for (int i = bid_start; i != bid_end; i = (i + 1) & (buf_count - 1)) + { + /* each buffer added before the advance needs its own offset from the tail */ + io_uring_buf_ring_add(br->br, (void*)br->br->bufs[i].addr, buf_size, i, br_mask, offset++); + } + io_uring_buf_ring_advance(br->br, count); + return EV_OK; +} + +static int +__io_uring_init(struct ev_loop* loop) +{ + int ret; + ret = io_uring_queue_init_params(entries, &loop->ring, &params); + if (ret) + { + pgagroal_log_fatal("io_uring_queue_init_params: %s", strerror(-ret)); + exit(1); + } + ret = __io_uring_setup_buffers(loop); + if (ret) + { + pgagroal_log_fatal("__io_uring_setup_buffers error: %s", strerror(-ret)); + exit(1); + } + ret = io_uring_ring_dontfork(&loop->ring); + if (ret) + { + pgagroal_log_fatal("error on io_uring_ring_dontfork: %s", strerror(-ret)); + exit(1); + } + return EV_OK; +} + +static int +__io_uring_destroy(struct ev_loop* loop) +{ + const int bgid = 0; /* const for now */ + struct io_buf_ring* br = &loop->br; + /* liburing reports failures as a negative errno return value and does not set errno */ + int ret = io_uring_free_buf_ring(&loop->ring, br->br, buf_count, bgid); + if (ret) + { + pgagroal_log_fatal("ev: io_uring_free_buf_ring (%s)", strerror(-ret)); + exit(1); + } + free(br->buf); + io_uring_queue_exit(&loop->ring); + return EV_OK; +} + +static int +__io_uring_io_start(struct ev_loop* loop, struct ev_io* w) +{ + struct io_uring_sqe* sqe = __io_uring_get_sqe(loop); + io_uring_sqe_set_data(sqe, w); + switch (w->type) + { + case EV_ACCEPT: + io_uring_prep_multishot_accept(sqe, w->fd, NULL, NULL, 0); + break; + case EV_RECEIVE: + io_uring_prep_recv(sqe, w->fd, loop->br.buf, buf_size, 0); + if (0) + { + sqe->ioprio |= IORING_RECV_MULTISHOT; + } + + sqe->flags |=
IOSQE_BUFFER_SELECT; + sqe->buf_group = 0; + break; + case EV_SEND: + io_uring_prep_send(sqe, w->fd, w->data, w->size, MSG_WAITALL | MSG_NOSIGNAL); /* TODO: flags */ + break; + default: + pgagroal_log_fatal("unknown event type: %d", w->type); + exit(1); + } + return EV_OK; +} + +static int +__io_uring_io_stop(struct ev_loop* loop, struct ev_io* target) /* Prepare a cancel request keyed on the watcher pointer. While no SQE is free, the pending SQEs are submitted and the request for one is repeated. Always returns EV_OK. */ +{ + int ret = EV_OK; + struct io_uring_sqe* sqe; + /* NOTE: When io_stop is called it may never return to a loop + * where sqes are submitted. Flush these sqes so the get call + * doesn't return NULL. */ + do + { + sqe = io_uring_get_sqe(&loop->ring); + if (sqe) + { + break; + } + io_uring_submit(&loop->ring); + } + while (1); + io_uring_prep_cancel64(sqe, (uint64_t)target, 0); /* TODO: flags? */ + return ret; +} + +static int +__io_uring_signal_start(struct ev_loop* loop, struct ev_signal* w) /* No work is done here for the io_uring backend; always returns EV_OK. */ +{ + return EV_OK; +} + +static int +__io_uring_signal_stop(struct ev_loop* loop, struct ev_signal* w) /* No work is done here for the io_uring backend; always returns EV_OK. */ +{ + return EV_OK; +} + +static int +__io_uring_periodic_init(struct ev_periodic* w, int msec) /* Split msec into the seconds / nanoseconds pair stored in w->ts. Always returns EV_OK. */ +{ + w->ts = (struct __kernel_timespec) { + .tv_sec = msec / 1000, + .tv_nsec = (msec % 1000) * 1000000 + }; + return EV_OK; +} + +static int +__io_uring_periodic_start(struct ev_loop* loop, struct ev_periodic* w) /* Prepare a multishot timeout with interval w->ts and w as user data. NOTE(review): the SQE returned by io_uring_get_sqe() is used without a NULL check. */ +{ + struct io_uring_sqe* sqe = io_uring_get_sqe(&loop->ring); + io_uring_sqe_set_data(sqe, w); + io_uring_prep_timeout(sqe, &w->ts, 0, IORING_TIMEOUT_MULTISHOT); + return EV_OK; +} + +static int +__io_uring_periodic_stop(struct ev_loop* loop, struct ev_periodic* w) /* Prepare a cancel request keyed on the periodic watcher pointer. NOTE(review): the SQE returned by io_uring_get_sqe() is used without a NULL check. */ +{ + struct io_uring_sqe* sqe; + sqe = io_uring_get_sqe(&loop->ring); + io_uring_prep_cancel64(sqe, (uint64_t)w, 0); /* TODO: flags? 
*/ + return EV_OK; +} + +/* + * Based on: https://git.kernel.dk/cgit/liburing/tree/examples/proxy.c + * (C) 2024 Jens Axboe + */ +static int +__io_uring_loop(struct ev_loop* loop) +{ + int ret; + int signum; + int events; + int to_wait = 1; /* wait for any 1 */ + unsigned int head; + struct io_uring_cqe* cqe = NULL; + struct __kernel_timespec* ts = NULL; + struct __kernel_timespec idle_ts = { + .tv_sec = 0, + .tv_nsec = 10000000LL + }; + struct timespec timeout = { + .tv_sec = 0, + .tv_nsec = 0, + }; + + set_running(loop); + while (is_running(loop)) + { + ts = &idle_ts; + io_uring_submit_and_wait_timeout(&loop->ring, &cqe, to_wait, ts, NULL); + + /* Good idea to leave here to see what happens */ + if (*loop->ring.cq.koverflow) + { + pgagroal_log_error("io_uring overflow %u", *loop->ring.cq.koverflow); + exit(EXIT_FAILURE); + } + if (*loop->ring.sq.kflags & IORING_SQ_CQ_OVERFLOW) + { + pgagroal_log_error("io_uring overflow"); + exit(EXIT_FAILURE); + } + + /* Check for signals before iterating over cqes */ + signum = sigtimedwait(&loop->sigset, NULL, &timeout); + if (signum > 0) + { + ret = __io_uring_signal_handler(loop, signum); + + if (ret == EV_ERROR) + { + pgagroal_log_error("Signal handling error"); + return EV_ERROR; + } + if (!is_running(loop)) + { + break; + } + } + + events = 0; + io_uring_for_each_cqe(&(loop->ring), head, cqe) + { + ret = __io_uring_handler(loop, cqe); + events++; + } + if (events) + { + io_uring_cq_advance(&loop->ring, events); /* batch marking as seen */ + } + + /* TODO: housekeeping ? */ + + } + return ret; +} + +static int +__io_uring_fork(struct ev_loop* loop) +{ + return EV_OK; +} + +static int +__io_uring_handler(struct ev_loop* loop, struct io_uring_cqe* cqe) +{ + int ret = EV_OK; + ev_watcher w; + w.io = (ev_io*)io_uring_cqe_get_data(cqe); + + void* buf; + + /* + * Cancelled requests will trigger the handler, but have NULL data. 
+ */ + if (!w.io) + { + return EV_OK; + } + + /* io handler */ + switch (w.io->type) + { + case EV_PERIODIC: + return __io_uring_periodic_handler(loop, w.periodic); + case EV_ACCEPT: + return __io_uring_accept_handler(loop, w.io, cqe); + case EV_SEND: + return __io_uring_send_handler(loop, w.io, cqe); + case EV_RECEIVE: +retry: + ret = __io_uring_receive_handler(loop, w.io, cqe, &buf, false); + switch (ret) + { + case EV_CONNECTION_CLOSED: /* connection closed */ + break; + case EV_ERROR: + break; + case EV_REPLENISH_BUFFERS: + if (__io_uring_setup_more_buffers(loop)) + { + return EV_ERROR; + } + goto retry; + break; + } + break; + default: + pgagroal_log_fatal("unknown event type: %d", w.io->type); + exit(1); + } + return ret; +} + +static int +__io_uring_periodic_handler(struct ev_loop* loop, struct ev_periodic* w) /* Invoke the periodic callback with 0 as its last argument; always returns EV_OK. */ +{ + w->cb(loop, w, 0); + return EV_OK; +} + +static int +__io_uring_accept_handler(struct ev_loop* loop, struct ev_io* w, struct io_uring_cqe* cqe) /* Store the CQE result in w->client_fd and invoke the callback with EV_OK. cqe->res is not checked for a negative value here. */ +{ + w->client_fd = cqe->res; + w->cb(loop, w, EV_OK); + return EV_OK; +} + +static int +__io_uring_send_handler(struct ev_loop* loop, struct ev_io* w, struct io_uring_cqe* cqe) /* On a send completion: re-add the buffer found in ring slot 0 as buffer id 0, then prepare a buffer-select receive on w->fd. cqe is not inspected. Always returns EV_OK. */ +{ + struct io_buf_ring* br = &loop->br; + const int bid = 0; + const int cnt = 1; + + io_uring_buf_ring_add(br->br, (void*) br->br->bufs[bid].addr, buf_size, 0, br_mask, bid); + io_uring_buf_ring_advance(br->br, cnt); + + struct io_uring_sqe* sqe = __io_uring_get_sqe(loop); + io_uring_sqe_set_data(sqe, w); + io_uring_prep_recv(sqe, w->fd, NULL, 0, 0); + sqe->flags |= IOSQE_BUFFER_SELECT; /* MSG_WAITALL was OR-ed in here as well: it is a recv(2) flag, not an IOSQE_* flag, and fell outside the 8-bit sqe->flags field, so removing it leaves the submitted request unchanged */ + return EV_OK; +} + +static int +__io_uring_signal_handler(struct ev_loop* loop, int signo) /* Invoke the callback of the first watcher on loop->shead registered for signo; returns EV_ERROR when none matches. */ +{ + struct ev_signal* w; + for_each(w, loop->shead.next) + { + if (w->signum == signo) + { + w->cb(loop, w, 0); + return EV_OK; + } + } + return EV_ERROR; +} + +static int +__io_uring_receive_handler(struct ev_loop* loop, struct ev_io* w, struct io_uring_cqe* cqe, void** _unused, bool __unused) +{ + int bid = cqe->flags >> 
IORING_CQE_BUFFER_SHIFT; + struct io_buf_ring* br = &loop->br; + int total_in_bytes = cqe->res; + int cnt = 1; + + if (cqe->res == -ENOBUFS) + { + pgagroal_log_warn("ev: Not enough buffers"); + exit(1); + /* TODO return EV_REPLENISH_BUFFERS; */ + } + + if (!(cqe->flags & IORING_CQE_F_BUFFER) && !(cqe->res)) + { + pgagroal_log_debug("ev: Connection closed"); + w->data = NULL; + w->size = 0; + w->cb(loop, w, EV_OK); + return EV_OK; + } + + w->data = br->buf + (bid * buf_size); + w->size = total_in_bytes; + w->cb(loop, w, EV_OK); + + // struct io_uring_sqe* sqe = __io_uring_get_sqe(loop); + // io_uring_sqe_set_data(sqe, w); + // io_uring_prep_send(sqe, w->sendto->fd, w->data, w->size, 0); + + io_uring_buf_ring_add(br->br, w->data, buf_size, bid, br_mask, 0); /* tail offset 0: exactly one buffer is queued before the advance, so it must land on the slot the advance publishes; using bid as the offset wrote it into an unrelated slot */ + io_uring_buf_ring_advance(br->br, cnt); + + __io_uring_io_start(loop, w); + + return EV_OK; +} + +static int __attribute__((unused)) +__io_uring_receive_multishot_handler(struct ev_loop* loop, struct ev_io* w, struct io_uring_cqe* cqe, void** unused, bool is_proxy) /* Multishot variant of the receive handler (marked unused): returns EV_REPLENISH_BUFFERS on -ENOBUFS and EV_CONNECTION_CLOSED for a zero-length completion without a buffer or a completion without IORING_CQE_F_MORE; otherwise hands the received bytes to the callback and recycles the buffer. */ +{ + struct io_buf_ring* br = &loop->br; + int bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT; + int total_in_bytes = cqe->res; + int cnt = 1; + + if (cqe->res == -ENOBUFS) + { + pgagroal_log_warn("ev: Not enough buffers"); + return EV_REPLENISH_BUFFERS; + } + + if (!(cqe->flags & IORING_CQE_F_BUFFER) && !(cqe->res)) + { + pgagroal_log_debug("ev: Connection closed"); + return EV_CONNECTION_CLOSED; + } + else if (!(cqe->flags & IORING_CQE_F_MORE)) + { + /* do not rearm receive. 
In fact, disarm anything so pgagroal can deal with + * read / write from sockets + */ + pgagroal_log_debug("ev: Transaction likely cancelled"); + w->data = NULL; + w->size = 0; + w->cb(loop, w, EV_ERROR); + return EV_CONNECTION_CLOSED; + } + + w->data = br->buf + (bid * buf_size); + w->size = total_in_bytes; + w->cb(loop, w, EV_OK); + io_uring_buf_ring_add(br->br, w->data, buf_size, bid, br_mask, 0); /* tail offset 0: a single buffer is queued before the advance, so it must land on the slot the advance publishes */ + io_uring_buf_ring_advance(br->br, cnt); + + return EV_OK; +} + +static int +__io_uring_setup_buffers(struct ev_loop* loop) /* Allocate buf_count * buf_size bytes of aligned memory, register a provided-buffer ring for group 0 and add every buffer to it, buffer i at tail offset i, before one advance. Failures are logged and end the process; returns EV_OK otherwise. */ +{ + int ret; + int br_bgid = 0; + int br_flags = 0; + void* ptr; + + struct io_buf_ring* br = &loop->br; + if (use_huge) + { + pgagroal_log_fatal("io_uring use_huge not implemented yet"); + exit(1); + } + if (posix_memalign(&br->buf, ALIGNMENT, buf_count * buf_size)) + { + pgagroal_log_fatal("posix_memalign"); + exit(1); + } + + br->br = io_uring_setup_buf_ring(&loop->ring, buf_count, br_bgid, br_flags, &ret); + if (!br->br) + { + pgagroal_log_fatal("buffer ring register failed %s", strerror(-ret)); /* strerror() yields a string, so the conversion has to be %s */ + exit(1); + } + + ptr = br->buf; + for (int i = 0; i < buf_count; i++) + { + io_uring_buf_ring_add(br->br, ptr, buf_size, i, br_mask, i); + ptr += buf_size; + } + io_uring_buf_ring_advance(br->br, buf_count); + + return EV_OK; +} + +static int +__io_uring_setup_more_buffers(struct ev_loop* loop) /* Placeholder: the unconditional exit(1) right after the declarations ends the process, so none of the statements below it run. */ +{ + int ret = EV_OK; + int br_bgid = 0; + int br_flags = 0; + void* ptr; + exit(1); + + struct io_buf_ring* br = &loop->br; + if (use_huge) + { + pgagroal_log_fatal("io_uring use_huge not implemented yet"); + exit(1); + } + if (posix_memalign(&br->buf, ALIGNMENT, buf_count * buf_size)) + { + pgagroal_log_fatal("posix_memalign"); + exit(1); + } + + br->br = io_uring_setup_buf_ring(&loop->ring, buf_count, br_bgid, br_flags, &ret); + if (!br->br) + { + pgagroal_log_fatal("buffer ring register failed %s", strerror(-ret)); /* strerror() yields a string, so the conversion has to be %s */ + exit(1); + } + + ptr = br->buf; + for (int i = 0; i < buf_count; i++) + { + io_uring_buf_ring_add(br->br, ptr, buf_size, i, br_mask, i); 
+ ptr += buf_size; + } + io_uring_buf_ring_advance(br->br, buf_count); + + return EV_OK; +} + +void +_next_bid(struct ev_loop* loop, int* bid) /* Advance *bid to the next buffer id, wrapping at buf_count. loop is unused. */ +{ + *bid = (*bid + 1) % buf_count; +} + +int +__epoll_loop(struct ev_loop* loop) /* Run the epoll loop while is_running(loop) holds: a signalfd for loop->sigset is added to the epoll set, then each wake-up (10 ms timeout) dispatches ready entries to the signal handler or to the watcher stored in data.u64. Returns the status of the last handler that ran, EV_OK if none did. A negative result from the wait call is not reported. */ +{ + int ret = EV_OK; + int nfds; + struct epoll_event events[MAX_EVENTS]; +#if HAVE_EPOLL_PWAIT2 + struct timespec timeout_ts = { + .tv_sec = 0, + .tv_nsec = 10000000LL, + }; +#else + int timeout = 10; /* ms; epoll_pwait() takes milliseconds, this is the same 10 ms as timeout_ts in the epoll_pwait2 branch (the previous value was 10000) */ +#endif + struct epoll_event ev = { + .events = EPOLLIN | EPOLLET, + .data.fd = signalfd(-1, &loop->sigset, 0), + }; + if (ev.data.fd == -1) + { + pgagroal_log_fatal("signalfd"); + exit(1); + } + if (epoll_ctl(loop->epollfd, EPOLL_CTL_ADD, ev.data.fd, &ev) == -1) + { + pgagroal_log_fatal("ev: epoll_ctl (%s)", strerror(errno)); + exit(1); + } + + set_running(loop); + while (is_running(loop)) + { +#if HAVE_EPOLL_PWAIT2 + nfds = epoll_pwait2(loop->epollfd, events, MAX_EVENTS, &timeout_ts, &loop->sigset); +#else + nfds = epoll_pwait(loop->epollfd, events, MAX_EVENTS, timeout, &loop->sigset); +#endif + + for (int i = 0; i < nfds; i++) + { + if (events[i].data.fd == ev.data.fd) + { + ret = __epoll_signal_handler(loop); + } + else + { + ret = __epoll_handler(loop, (void*)events[i].data.u64); + } + } + } + return ret; +} + +static int +__epoll_init(struct ev_loop* loop) /* Create the epoll instance with epoll_flags; exits the process on failure, returns EV_OK otherwise. */ +{ + loop->epollfd = epoll_create1(epoll_flags); + if (loop->epollfd == -1) + { + pgagroal_log_fatal("epoll_init"); + exit(1); + } + return EV_OK; +} + +static int +__epoll_fork(struct ev_loop* loop) /* Close the epoll descriptor; always returns EV_OK. */ +{ + + close(loop->epollfd); + return EV_OK; +} + +static int +__epoll_destroy(struct ev_loop* loop) /* Close the epoll descriptor; always returns EV_OK. */ +{ + close(loop->epollfd); + return EV_OK; +} + +static int +__epoll_handler(struct ev_loop* loop, void* wp) /* Read the type field through a struct ev_periodic view of wp: EV_PERIODIC goes to the periodic handler, every other type is treated as a struct ev_io. */ +{ + struct ev_periodic* w = (struct ev_periodic*)wp; + if (w->type == EV_PERIODIC) + { + return __epoll_periodic_handler(loop, (struct ev_periodic*)w); + } + return __epoll_io_handler(loop, (struct ev_io*)w); +} + +static int +__epoll_signal_start(struct ev_loop* loop, 
struct ev_signal* w) +{ + return EV_OK; +} + +static int +__epoll_signal_stop(struct ev_loop* loop, struct ev_signal* w) /* No work is done here for the epoll backend; always returns EV_OK. */ +{ + return EV_OK; +} + +static int +__epoll_signal_handler(struct ev_loop* loop) /* Fetch one signal from loop->sigset with sigwaitinfo() and invoke the callback of the first watcher on loop->shead registered for it. Returns EV_ERROR when sigwaitinfo() fails or no watcher matches. */ +{ + struct ev_signal* w; + siginfo_t siginfo; + int signo; + signo = sigwaitinfo(&loop->sigset, &siginfo); + if (signo == -1) + { + pgagroal_log_error("sigwaitinfo"); + return EV_ERROR; + } + + for_each(w, loop->shead.next) + { + if (w->signum == signo) + { + w->cb(loop, w, 0); + return EV_OK; + } + } + + pgagroal_log_error("No handler found for signal %d", signo); + return EV_ERROR; +} + +static int +__epoll_periodic_init(struct ev_periodic* w, int msec) /* Create a non-blocking CLOCK_MONOTONIC timerfd in w->fd whose first expiry and interval are both msec. Returns EV_ERROR if clock_gettime(), timerfd_create() or timerfd_settime() fails; the value read into now is not used afterwards. */ +{ + struct timespec now; + struct itimerspec new_value; + + if (clock_gettime(CLOCK_MONOTONIC, &now) == -1) + { + pgagroal_log_error("clock_gettime"); + return EV_ERROR; + } + + new_value.it_value.tv_sec = msec / 1000; + new_value.it_value.tv_nsec = (msec % 1000) * 1000000; + + new_value.it_interval.tv_sec = msec / 1000; + new_value.it_interval.tv_nsec = (msec % 1000) * 1000000; + + w->fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK); /* no need to set it to non-blocking due to TFD_NONBLOCK */ + if (w->fd == -1) + { + pgagroal_log_error("timerfd_create"); + return EV_ERROR; + } + + if (timerfd_settime(w->fd, 0, &new_value, NULL) == -1) + { + pgagroal_log_error("timerfd_settime"); + close(w->fd); + return EV_ERROR; + } + return EV_OK; +} + +static int +__epoll_periodic_start(struct ev_loop* loop, struct ev_periodic* w) /* Add the timer fd to the epoll set for EPOLLIN with the watcher pointer in data.u64; exits the process if epoll_ctl() fails. */ +{ + struct epoll_event event; + event.events = EPOLLIN; + event.data.u64 = (uint64_t)w; + if (epoll_ctl(loop->epollfd, EPOLL_CTL_ADD, w->fd, &event) == -1) + { + pgagroal_log_fatal("ev: epoll_ctl (%s)", strerror(errno)); + exit(1); + } + return EV_OK; +} + +static int +__epoll_periodic_stop(struct ev_loop* loop, struct ev_periodic* w) /* Remove the timer fd from the epoll set; exits the process if epoll_ctl() fails. */ +{ + if (epoll_ctl(loop->epollfd, EPOLL_CTL_DEL, w->fd, NULL) == -1) + { + pgagroal_log_fatal("%s: epoll_ctl (%s)", __func__, strerror(errno)); /* the format has two %s conversions, so the function name is supplied for the first */ + exit(1); + } + return EV_OK; 
+} + +static int +__epoll_periodic_handler(struct ev_loop* loop, struct ev_periodic* w) /* Read the 8-byte value from the timer fd, then invoke the callback. A read of any other length is logged and returns EV_ERROR without calling the callback. */ +{ + uint64_t exp; + int nread = read(w->fd, &exp, sizeof(uint64_t)); + if (nread != sizeof(uint64_t)) + { + pgagroal_log_error("periodic_handler: read"); + return EV_ERROR; + } + w->cb(loop, w, 0); + return EV_OK; +} + +static int +__epoll_io_start(struct ev_loop* loop, struct ev_io* w) /* Add w->fd to the epoll set: EPOLLIN for accept and receive watchers (a receive fd is first passed to pgagroal_socket_nonblocking), EPOLLOUT for send watchers. The watcher pointer is stored in data.u64. Exits the process on an unknown type or an epoll_ctl() failure. */ +{ + struct epoll_event event; + + event.data.u64 = (uintptr_t)w; + + switch (w->type) + { + case EV_ACCEPT: + event.events = EPOLLIN; + break; + case EV_RECEIVE: + pgagroal_socket_nonblocking(w->fd, true); + event.events = EPOLLIN; /* TODO: | EPOLLET; */ + break; + case EV_SEND: + event.events = EPOLLOUT; + break; + default: + pgagroal_log_fatal("unknown event type: %d", w->type); + exit(1); + } + + if (epoll_ctl(loop->epollfd, EPOLL_CTL_ADD, w->fd, &event) == -1) + { + pgagroal_log_fatal("ev: epoll_ctl (%s)", strerror(errno)); + exit(1); + } + + return EV_OK; +} + +static int +__epoll_io_stop(struct ev_loop* ev, struct ev_io* target) /* Remove target->fd from the epoll set. EBADF, ENOENT and EINVAL are only logged at debug level; any other error ends the process. Always returns EV_OK. */ +{ + if (epoll_ctl(ev->epollfd, EPOLL_CTL_DEL, target->fd, NULL) == -1) + { + if (errno == EBADF || errno == ENOENT || errno == EINVAL) + { + pgagroal_log_debug("ev: epoll_ctl failed (%s)", strerror(errno)); + } + else + { + pgagroal_log_fatal("ev: epoll_ctl (%s)", strerror(errno)); + exit(1); + } + } + return EV_OK; +} + +static int +__epoll_io_handler(struct ev_loop* loop, struct ev_io* w) /* Route a ready I/O watcher to the accept, send or receive handler by w->type; exits the process on an unknown type. */ +{ + switch (w->type) + { + case EV_ACCEPT: + return __epoll_accept_handler(loop, w); + case EV_SEND: + return __epoll_send_handler(loop, w); + case EV_RECEIVE: + return __epoll_receive_handler(loop, w); + default: + pgagroal_log_fatal("unknown event type: %d", w->type); + exit(1); + } +} + +static int +__epoll_accept_handler(struct ev_loop* loop, struct ev_io* w) /* Accept one connection on the listening fd w->fd. An accept() failure other than EAGAIN / EWOULDBLOCK sets the result to EV_ERROR. */ +{ + int ret = EV_OK; + int listen_fd = w->fd; + int client_fd; + + client_fd = accept(listen_fd, NULL, NULL); + if (client_fd == -1) + { + if (!(errno == EAGAIN) && !(errno == EWOULDBLOCK)) + { + ret = EV_ERROR; + } 
+ } + else + { + pgagroal_socket_nonblocking(client_fd, true); + w->client_fd = client_fd; + w->cb(loop, w, ret); + } + + return ret; +} + +static int +__epoll_receive_handler(struct ev_loop* loop, struct ev_io* w) /* Invoke the callback with EV_OK; no data is read from the socket here. */ +{ + int ret = EV_OK; + w->cb(loop, w, ret); + return ret; +} + +static int +__epoll_send_handler(struct ev_loop* loop, struct ev_io* w) /* Invoke the callback with EV_OK; no data is written to the socket here. */ +{ + int ret = EV_OK; + w->cb(loop, w, ret); + return ret; +} + +#else + +int +__kqueue_loop(struct ev_loop* ev) /* Run the kqueue loop: wait up to 10 ms in kevent() and dispatch each returned event, repeating while is_running(ev) holds. EINTR restarts the wait; any other kevent() failure is logged, calls loop_break() and returns EV_ERROR. Otherwise returns the status of the last handler that ran. */ +{ + int ret = EV_OK; + int nfds; + struct kevent events[MAX_EVENTS]; + struct timespec timeout; + timeout.tv_sec = 0; + timeout.tv_nsec = 10000000; /* 10 ms */ + + set_running(ev); + do + { + nfds = kevent(ev->kqueuefd, NULL, 0, events, MAX_EVENTS, &timeout); + if (nfds == -1) + { + if (errno == EINTR) + { + continue; + } + pgagroal_log_error("kevent"); + ret = EV_ERROR; + loop_break(ev); + break; + } + for (int i = 0; i < nfds; i++) + { + ret = __kqueue_handler(ev, &events[i]); + } + } + while (is_running(ev)); + return ret; +} + +static int +__kqueue_init(struct ev_loop* ev) /* Create the kqueue descriptor; on failure the error is reported through perror() and the log, and the process exits. */ +{ + ev->kqueuefd = kqueue(); + if (ev->kqueuefd == -1) + { + perror("kqueue"); + pgagroal_log_fatal("kqueue init error"); + exit(1); + } + return EV_OK; +} + +static int +__kqueue_fork(struct ev_loop* loop) /* Close the kqueue descriptor; always returns EV_OK. */ +{ + close(loop->kqueuefd); + return EV_OK; +} + +static int +__kqueue_destroy(struct ev_loop* loop) /* Close the kqueue descriptor; always returns EV_OK. */ +{ + close(loop->kqueuefd); + return EV_OK; +} + +static int +__kqueue_handler(struct ev_loop* ev, struct kevent* kev) /* Route one kevent by its filter: timer, signal, or read/write I/O. An unknown filter ends the process. */ +{ + switch (kev->filter) + { + case EVFILT_TIMER: + return __kqueue_periodic_handler(ev, kev); + case EVFILT_SIGNAL: + return __kqueue_signal_handler(ev, kev); + case EVFILT_READ: + case EVFILT_WRITE: + return __kqueue_io_handler(ev, kev); + default: + pgagroal_log_fatal("ev: Unknown filter in handler"); + exit(1); + } +} + +static int +__kqueue_signal_start(struct ev_loop* loop, struct ev_signal* w) /* Register an EVFILT_SIGNAL event for w->signum with the watcher as udata; exits the process if kevent() fails. */ +{ + struct kevent kev; + + pgagroal_log_debug("ev: starting signal %d", w->signum); + EV_SET(&kev, w->signum, 
EVFILT_SIGNAL, EV_ADD, 0, 0, w); + if (kevent(loop->kqueuefd, &kev, 1, NULL, 0, NULL) == -1) + { + pgagroal_log_fatal("ev: kevent (%s)", strerror(errno)); + exit(1); + } + return EV_OK; +} + +static int +__kqueue_signal_stop(struct ev_loop* ev, struct ev_signal* w) /* Delete the EVFILT_SIGNAL event for w->signum; exits the process if kevent() fails. */ +{ + struct kevent kev; + + EV_SET(&kev, w->signum, EVFILT_SIGNAL, EV_DELETE, 0, 0, w); + if (kevent(ev->kqueuefd, &kev, 1, NULL, 0, NULL) == -1) + { + pgagroal_log_fatal("ev: kevent (%s)", strerror(errno)); + exit(1); + } + return EV_OK; +} + +static int +__kqueue_signal_handler(struct ev_loop* ev, struct kevent* kev) /* Invoke the callback of the signal watcher stored in kev->udata when its signum equals kev->ident; otherwise log and return EV_ERROR. */ +{ + struct ev_signal* w = (struct ev_signal*)kev->udata; + + if (w->signum == (int)kev->ident) + { + w->cb(ev, w, 0); + return EV_OK; + } + else + { + pgagroal_log_error("No handler found for signal %d", (int)kev->ident); + return EV_ERROR; + } +} + +static int +__kqueue_periodic_init(struct ev_periodic* w, int msec) /* Store msec as the watcher interval; always returns EV_OK. */ +{ + w->interval = msec; + return EV_OK; +} + +static int +__kqueue_periodic_start(struct ev_loop* ev, struct ev_periodic* w) /* Register an EVFILT_TIMER event identified by the watcher address, with the interval converted from milliseconds to microseconds (NOTE_USECONDS). Returns EV_ERROR if kevent() fails. */ +{ + struct kevent kev; + EV_SET(&kev, (uintptr_t)w, EVFILT_TIMER, EV_ADD | EV_ENABLE, NOTE_USECONDS, w->interval * 1000, w); + if (kevent(ev->kqueuefd, &kev, 1, NULL, 0, NULL) == -1) + { + pgagroal_log_error("kevent: timer add"); + return EV_ERROR; + } + return EV_OK; +} + +static int +__kqueue_periodic_stop(struct ev_loop* ev, struct ev_periodic* w) /* Delete the EVFILT_TIMER event identified by the watcher address. Returns EV_ERROR if kevent() fails. */ +{ + struct kevent kev; + EV_SET(&kev, (uintptr_t)w, EVFILT_TIMER, EV_DELETE, 0, 0, NULL); + if (kevent(ev->kqueuefd, &kev, 1, NULL, 0, NULL) == -1) + { + pgagroal_log_error("kevent: timer delete"); + return EV_ERROR; + } + + return EV_OK; +} + +static int +__kqueue_periodic_handler(struct ev_loop* ev, struct kevent* kev) /* Invoke the callback of the periodic watcher stored in kev->udata; always returns EV_OK. */ +{ + struct ev_periodic* w = (struct ev_periodic*)kev->udata; + pgagroal_log_debug("%s", __func__); /* the %s conversion had no argument; the function name is supplied for it */ + w->cb(ev, w, 0); + return EV_OK; +} + +static int +__kqueue_io_start(struct ev_loop* ev, struct ev_io* w) /* Register w->fd with kqueue: a read filter for accept and receive watchers, a write filter for send watchers. */ +{ + struct kevent kev; + int filter; + + switch (w->type) + { + case EV_ACCEPT: + case 
EV_RECEIVE: + filter = EVFILT_READ; + break; + case EV_SEND: + filter = EVFILT_WRITE; + break; + default: + pgagroal_log_fatal("unknown event type: %d", w->type); + exit(1); + } + + pgagroal_socket_nonblocking(w->fd, true); + + EV_SET(&kev, w->fd, filter, EV_ADD | EV_ENABLE | EV_CLEAR, 0, 0, w); + + if (kevent(ev->kqueuefd, &kev, 1, NULL, 0, NULL) == -1) + { + pgagroal_log_error("%s: kevent add failed", __func__); + return EV_ERROR; + } + + return EV_OK; +} + +static int +__kqueue_io_stop(struct ev_loop* ev, struct ev_io* w) /* Delete the kqueue event for w->fd, using the read filter for accept and receive watchers and the write filter for send watchers. An unknown type ends the process; a kevent() failure is logged and returns EV_ERROR. */ +{ + struct kevent kev; + int filter; + + switch (w->type) + { + case EV_ACCEPT: + case EV_RECEIVE: + filter = EVFILT_READ; + break; + case EV_SEND: + filter = EVFILT_WRITE; + break; + default: + pgagroal_log_fatal("unknown event type: %d", w->type); + exit(1); + } + + EV_SET(&kev, w->fd, filter, EV_DELETE, 0, 0, NULL); + + if (kevent(ev->kqueuefd, &kev, 1, NULL, 0, NULL) == -1) + { + pgagroal_log_error("%s: kevent delete failed", __func__); + return EV_ERROR; + } + + return EV_OK; +} + +static int +__kqueue_io_handler(struct ev_loop* ev, struct kevent* kev) /* Route the I/O watcher stored in kev->udata to the accept, send or receive handler by its type and return that handler's status; exits the process on an unknown type. */ +{ + struct ev_io* w = (struct ev_io*)kev->udata; + int ret = EV_OK; + + switch (w->type) + { + case EV_ACCEPT: + ret = __kqueue_accept_handler(ev, w); + break; + case EV_SEND: + ret = __kqueue_send_handler(ev, w); + break; + case EV_RECEIVE: + ret = __kqueue_receive_handler(ev, w); + break; + default: + pgagroal_log_fatal("unknown event type: %d", w->type); + exit(1); + } + + return ret; +} + +static int +__kqueue_receive_handler(struct ev_loop* loop, struct ev_io* w) /* Invoke the callback with EV_OK; no data is read from the socket here. */ +{ + int ret = EV_OK; + w->cb(loop, w, ret); + return ret; +} + +static int +__kqueue_send_handler(struct ev_loop* loop, struct ev_io* w) /* Invoke the callback with EV_OK; no data is written to the socket here. */ +{ + int ret = EV_OK; + w->cb(loop, w, ret); + return ret; +} + +static int +__kqueue_accept_handler(struct ev_loop* ev, struct ev_io* w) /* Accept connections on the listening fd w->fd in a loop, storing each new descriptor in w->client_fd. */ +{ + int ret = EV_OK; + int listen_fd = w->fd; + + while (1) + { + w->client_fd = accept(listen_fd, NULL, NULL); + if (w->client_fd == -1) + { + if 
(errno == EAGAIN || errno == EWOULDBLOCK) + { + ret = EV_OK; + break; + } + else + { + pgagroal_log_error("accept_handler: accept"); + ret = EV_ERROR; + break; + } + } + else + { + w->cb(ev, w, ret); + } + } + + return ret; +} + +#endif /* HAVE_LINUX */ diff --git a/src/libpgagroal/message.c b/src/libpgagroal/message.c index fb8f2b14..d0cc1ba3 100644 --- a/src/libpgagroal/message.c +++ b/src/libpgagroal/message.c @@ -42,6 +42,7 @@ #include #include +static int read_message_from_buffer(void* data, ssize_t size, struct message** msg); static int read_message(int socket, bool block, int timeout, struct message** msg); static int write_message(int socket, struct message* msg); @@ -87,6 +88,12 @@ pgagroal_read_socket_message(int socket, struct message** msg) return read_message(socket, false, 0, msg); } +int +pgagroal_buffer_to_message(void* data, ssize_t size, struct message** msg) /* Public entry point that forwards to read_message_from_buffer() and returns its status. */ +{ + return read_message_from_buffer(data, size, msg); +} + int pgagroal_write_socket_message(int socket, struct message* msg) { @@ -1205,6 +1212,28 @@ pgagroal_log_message(struct message* msg) } } +static int +read_message_from_buffer(void* data, ssize_t size, struct message** msg) /* Describe an existing buffer with the message object returned by pgagroal_memory_message(): data and length are set to the caller's pointer and size (nothing is copied here) and kind is taken from the first byte. Returns MESSAGE_STATUS_ERROR for a NULL buffer or size <= 0, MESSAGE_STATUS_OK otherwise. NOTE(review): the result of pgagroal_memory_message() is dereferenced without a NULL check - confirm it cannot be NULL. */ +{ + struct message* m = NULL; + + if (data == NULL || size <= 0) + { + pgagroal_log_error("read_message_from_buffer: bad buffer"); + return MESSAGE_STATUS_ERROR; + } + + m = pgagroal_memory_message(); + + m->data = data; + m->length = size; + + m->kind = (signed char)(*((char*)m->data)); + *msg = m; + + return MESSAGE_STATUS_OK; +} + static int read_message(int socket, bool block, int timeout, struct message** msg) { diff --git a/src/libpgagroal/network.c b/src/libpgagroal/network.c index dcedced3..630fa7ee 100644 --- a/src/libpgagroal/network.c +++ b/src/libpgagroal/network.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include #include @@ -428,6 +429,7 @@ pgagroal_socket_isvalid(int fd) int pgagroal_disconnect(int fd) { + pgagroal_log_trace("%s: fd=%d", __func__, fd); if (fd == -1) { return 
1; diff --git a/src/libpgagroal/pipeline_perf.c b/src/libpgagroal/pipeline_perf.c index a8a01bc9..9eb5784e 100644 --- a/src/libpgagroal/pipeline_perf.c +++ b/src/libpgagroal/pipeline_perf.c @@ -28,6 +28,7 @@ /* pgagroal */ #include +#include #include #include #include @@ -37,7 +38,6 @@ /* system */ #include -#include #include #include #include @@ -115,11 +115,18 @@ performance_client(struct ev_loop* loop, struct ev_io* watcher, int revents) int status = MESSAGE_STATUS_ERROR; struct worker_io* wi = NULL; struct message* msg = NULL; - struct main_configuration* config = NULL; + struct main_configuration* config = (struct main_configuration*)shmem; wi = (struct worker_io*)watcher; - status = pgagroal_read_socket_message(wi->client_fd, &msg); + if (config->ev_backend == EV_BACKEND_IO_URING) + { + status = pgagroal_buffer_to_message(watcher->data, watcher->size, &msg); + } + else + { + status = pgagroal_read_socket_message(wi->client_fd, &msg); + } if (likely(status == MESSAGE_STATUS_OK)) { if (likely(msg->kind != 'X')) @@ -140,7 +147,7 @@ performance_client(struct ev_loop* loop, struct ev_io* watcher, int revents) else if (msg->kind == 'X') { saw_x = true; - running = 0; + pgagroal_ev_loop_break(loop); } } else if (status == MESSAGE_STATUS_ZERO) @@ -152,11 +159,9 @@ performance_client(struct ev_loop* loop, struct ev_io* watcher, int revents) goto client_error; } - ev_break (loop, EVBREAK_ONE); return; client_done: - config = (struct main_configuration*)shmem; pgagroal_log_debug("[C] Client done (slot %d database %s user %s): %s (socket %d status %d)", wi->slot, config->connections[wi->slot].database, config->connections[wi->slot].username, strerror(errno), wi->client_fd, status); @@ -171,12 +176,10 @@ performance_client(struct ev_loop* loop, struct ev_io* watcher, int revents) exit_code = WORKER_SERVER_FAILURE; } - running = 0; - ev_break(loop, EVBREAK_ALL); + pgagroal_ev_loop_break(loop); return; client_error: - config = (struct main_configuration*)shmem; 
pgagroal_log_warn("[C] Client error (slot %d database %s user %s): %s (socket %d status %d)", wi->slot, config->connections[wi->slot].database, config->connections[wi->slot].username, strerror(errno), wi->client_fd, status); @@ -184,12 +187,10 @@ performance_client(struct ev_loop* loop, struct ev_io* watcher, int revents) errno = 0; exit_code = WORKER_CLIENT_FAILURE; - running = 0; - ev_break(loop, EVBREAK_ALL); + pgagroal_ev_loop_break(loop); return; server_error: - config = (struct main_configuration*)shmem; pgagroal_log_warn("[C] Server error (slot %d database %s user %s): %s (socket %d status %d)", wi->slot, config->connections[wi->slot].database, config->connections[wi->slot].username, strerror(errno), wi->server_fd, status); @@ -197,8 +198,7 @@ performance_client(struct ev_loop* loop, struct ev_io* watcher, int revents) errno = 0; exit_code = WORKER_SERVER_FAILURE; - running = 0; - ev_break(loop, EVBREAK_ALL); + pgagroal_ev_loop_break(loop); return; } @@ -209,18 +209,26 @@ performance_server(struct ev_loop* loop, struct ev_io* watcher, int revents) bool fatal = false; struct worker_io* wi = NULL; struct message* msg = NULL; - struct main_configuration* config = NULL; + struct main_configuration* config = (struct main_configuration*)shmem; wi = (struct worker_io*)watcher; if (wi->server_ssl == NULL) { - status = pgagroal_read_socket_message(wi->server_fd, &msg); + if (config->ev_backend == EV_BACKEND_IO_URING) + { + status = pgagroal_buffer_to_message(watcher->data, watcher->size, &msg); + } + else + { + status = pgagroal_read_socket_message(wi->server_fd, &msg); + } } else { status = pgagroal_read_ssl_message(wi->server_ssl, &msg); } + if (likely(status == MESSAGE_STATUS_OK)) { status = pgagroal_write_socket_message(wi->client_fd, msg); @@ -241,7 +249,7 @@ performance_server(struct ev_loop* loop, struct ev_io* watcher, int revents) if (fatal) { exit_code = WORKER_SERVER_FATAL; - running = 0; + pgagroal_ev_loop_break(loop); } } } @@ -254,11 +262,9 @@ 
performance_server(struct ev_loop* loop, struct ev_io* watcher, int revents) goto server_error; } - ev_break(loop, EVBREAK_ONE); return; client_error: - config = (struct main_configuration*)shmem; pgagroal_log_warn("[S] Client error (slot %d database %s user %s): %s (socket %d status %d)", wi->slot, config->connections[wi->slot].database, config->connections[wi->slot].username, strerror(errno), wi->client_fd, status); @@ -266,23 +272,20 @@ performance_server(struct ev_loop* loop, struct ev_io* watcher, int revents) errno = 0; exit_code = WORKER_CLIENT_FAILURE; - running = 0; - ev_break(loop, EVBREAK_ALL); + + pgagroal_ev_loop_break(loop); return; server_done: - config = (struct main_configuration*)shmem; pgagroal_log_debug("[S] Server done (slot %d database %s user %s): %s (socket %d status %d)", wi->slot, config->connections[wi->slot].database, config->connections[wi->slot].username, strerror(errno), wi->server_fd, status); errno = 0; - running = 0; - ev_break(loop, EVBREAK_ALL); + pgagroal_ev_loop_break(loop); return; server_error: - config = (struct main_configuration*)shmem; pgagroal_log_warn("[S] Server error (slot %d database %s user %s): %s (socket %d status %d)", wi->slot, config->connections[wi->slot].database, config->connections[wi->slot].username, strerror(errno), wi->server_fd, status); @@ -290,7 +293,7 @@ performance_server(struct ev_loop* loop, struct ev_io* watcher, int revents) errno = 0; exit_code = WORKER_SERVER_FAILURE; - running = 0; - ev_break(loop, EVBREAK_ALL); + + pgagroal_ev_loop_break(loop); return; } diff --git a/src/libpgagroal/pipeline_session.c b/src/libpgagroal/pipeline_session.c index 66136cf1..40fd4459 100644 --- a/src/libpgagroal/pipeline_session.c +++ b/src/libpgagroal/pipeline_session.c @@ -28,6 +28,7 @@ /* pgagroal */ #include +#include #include #include #include @@ -41,7 +42,7 @@ /* system */ #include -#include +#include #include #include #include @@ -295,7 +296,14 @@ session_client(struct ev_loop* loop, struct ev_io* watcher, 
int revents) if (wi->client_ssl == NULL) { - status = pgagroal_read_socket_message(wi->client_fd, &msg); + if (config->ev_backend == EV_BACKEND_IO_URING) + { + status = pgagroal_buffer_to_message(watcher->data, watcher->size, &msg); + } + else + { + status = pgagroal_read_socket_message(wi->client_fd, &msg); + } } else { @@ -369,7 +377,7 @@ session_client(struct ev_loop* loop, struct ev_io* watcher, int revents) else if (msg->kind == 'X') { saw_x = true; - running = 0; + pgagroal_ev_loop_break(loop); } } else if (status == MESSAGE_STATUS_ZERO) @@ -383,7 +391,6 @@ session_client(struct ev_loop* loop, struct ev_io* watcher, int revents) client_inactive(wi->slot); - ev_break(loop, EVBREAK_ONE); return; client_done: @@ -403,8 +410,7 @@ session_client(struct ev_loop* loop, struct ev_io* watcher, int revents) exit_code = WORKER_SERVER_FAILURE; } - running = 0; - ev_break(loop, EVBREAK_ALL); + pgagroal_ev_loop_break(loop); return; client_error: @@ -417,8 +423,8 @@ session_client(struct ev_loop* loop, struct ev_io* watcher, int revents) client_inactive(wi->slot); exit_code = WORKER_CLIENT_FAILURE; - running = 0; - ev_break(loop, EVBREAK_ALL); + + pgagroal_ev_loop_break(loop); return; server_error: @@ -431,8 +437,8 @@ session_client(struct ev_loop* loop, struct ev_io* watcher, int revents) client_inactive(wi->slot); exit_code = WORKER_SERVER_FAILURE; - running = 0; - ev_break(loop, EVBREAK_ALL); + + pgagroal_ev_loop_break(loop); return; failover: @@ -440,8 +446,8 @@ session_client(struct ev_loop* loop, struct ev_io* watcher, int revents) client_inactive(wi->slot); exit_code = WORKER_FAILOVER; - running = 0; - ev_break(loop, EVBREAK_ALL); + + pgagroal_ev_loop_break(loop); return; } @@ -452,7 +458,7 @@ session_server(struct ev_loop* loop, struct ev_io* watcher, int revents) bool fatal = false; struct worker_io* wi = NULL; struct message* msg = NULL; - struct main_configuration* config = NULL; + struct main_configuration* config = (struct main_configuration*)shmem; wi = 
(struct worker_io*)watcher; @@ -460,7 +466,14 @@ session_server(struct ev_loop* loop, struct ev_io* watcher, int revents) if (wi->server_ssl == NULL) { - status = pgagroal_read_socket_message(wi->server_fd, &msg); + if (config->ev_backend == EV_BACKEND_IO_URING) + { + status = pgagroal_buffer_to_message(watcher->data, watcher->size, &msg); + } + else + { + status = pgagroal_read_socket_message(wi->server_fd, &msg); + } } else { @@ -535,7 +548,7 @@ session_server(struct ev_loop* loop, struct ev_io* watcher, int revents) if (fatal) { exit_code = WORKER_SERVER_FATAL; - running = 0; + pgagroal_ev_loop_break(loop); } } } @@ -550,11 +563,9 @@ session_server(struct ev_loop* loop, struct ev_io* watcher, int revents) client_inactive(wi->slot); - ev_break(loop, EVBREAK_ONE); return; client_error: - config = (struct main_configuration*)shmem; pgagroal_log_warn("[S] Client error (slot %d database %s user %s): %s (socket %d status %d)", wi->slot, config->connections[wi->slot].database, config->connections[wi->slot].username, strerror(errno), wi->client_fd, status); @@ -564,12 +575,11 @@ session_server(struct ev_loop* loop, struct ev_io* watcher, int revents) client_inactive(wi->slot); exit_code = WORKER_CLIENT_FAILURE; - running = 0; - ev_break(loop, EVBREAK_ALL); + + pgagroal_ev_loop_break(loop); return; server_done: - config = (struct main_configuration*)shmem; pgagroal_log_debug("[S] Server done (slot %d database %s user %s): %s (socket %d status %d)", wi->slot, config->connections[wi->slot].database, config->connections[wi->slot].username, strerror(errno), wi->server_fd, status); @@ -577,12 +587,10 @@ session_server(struct ev_loop* loop, struct ev_io* watcher, int revents) client_inactive(wi->slot); - running = 0; - ev_break(loop, EVBREAK_ALL); + pgagroal_ev_loop_break(loop); return; server_error: - config = (struct main_configuration*)shmem; pgagroal_log_warn("[S] Server error (slot %d database %s user %s): %s (socket %d status %d)", wi->slot, 
config->connections[wi->slot].database, config->connections[wi->slot].username, strerror(errno), wi->server_fd, status); @@ -592,8 +600,8 @@ session_server(struct ev_loop* loop, struct ev_io* watcher, int revents) client_inactive(wi->slot); exit_code = WORKER_SERVER_FAILURE; - running = 0; - ev_break(loop, EVBREAK_ALL); + + pgagroal_ev_loop_break(loop); return; } diff --git a/src/libpgagroal/pipeline_transaction.c b/src/libpgagroal/pipeline_transaction.c index f874a84e..81ba0b5e 100644 --- a/src/libpgagroal/pipeline_transaction.c +++ b/src/libpgagroal/pipeline_transaction.c @@ -29,8 +29,8 @@ /* pgagroal */ #include #include +#include #include -//#include #include #include #include @@ -44,7 +44,6 @@ /* system */ #include -#include #include #include #include @@ -153,8 +152,7 @@ transaction_start(struct ev_loop* loop, struct worker_io* w) error: exit_code = WORKER_FAILURE; - running = 0; - ev_break(loop, EVBREAK_ALL); + pgagroal_ev_loop_break(loop); return; } @@ -174,7 +172,7 @@ transaction_stop(struct ev_loop* loop, struct worker_io* w) pgagroal_write_rollback(NULL, config->connections[slot].fd); } - ev_io_stop(loop, (struct ev_io*)&server_io); + pgagroal_ev_io_stop(loop, (struct ev_io*)&server_io); pgagroal_tracking_event_slot(TRACKER_TX_RETURN_CONNECTION_STOP, w->slot); pgagroal_return_connection(slot, w->server_ssl, true); slot = -1; @@ -221,7 +219,7 @@ transaction_client(struct ev_loop* loop, struct ev_io* watcher, int revents) memcpy(&config->connections[slot].appname[0], &appname[0], MAX_APPLICATION_NAME); - ev_io_init((struct ev_io*)&server_io, transaction_server, config->connections[slot].fd, EV_READ); + pgagroal_ev_io_accept_init((struct ev_io*)&server_io, config->connections[slot].fd, transaction_server); server_io.client_fd = wi->client_fd; server_io.server_fd = config->connections[slot].fd; server_io.slot = slot; @@ -230,12 +228,12 @@ transaction_client(struct ev_loop* loop, struct ev_io* watcher, int revents) fatal = false; - ev_io_start(loop, (struct 
ev_io*)&server_io); + pgagroal_ev_io_start(loop, (struct ev_io*)&server_io); } - if (wi->client_ssl == NULL) + if (wi->server_ssl == NULL) { - status = pgagroal_read_socket_message(wi->client_fd, &msg); + status = pgagroal_buffer_to_message(watcher->data, watcher->size, &msg); } else { @@ -322,7 +320,6 @@ transaction_client(struct ev_loop* loop, struct ev_io* watcher, int revents) else if (msg->kind == 'X') { saw_x = true; - running = 0; } } else if (status == MESSAGE_STATUS_ZERO) @@ -334,7 +331,6 @@ transaction_client(struct ev_loop* loop, struct ev_io* watcher, int revents) goto client_error; } - ev_break(loop, EVBREAK_ONE); return; client_done: @@ -352,8 +348,7 @@ transaction_client(struct ev_loop* loop, struct ev_io* watcher, int revents) exit_code = WORKER_SERVER_FAILURE; } - running = 0; - ev_break(loop, EVBREAK_ALL); + pgagroal_ev_loop_break(loop); return; client_error: @@ -364,8 +359,8 @@ transaction_client(struct ev_loop* loop, struct ev_io* watcher, int revents) errno = 0; exit_code = WORKER_CLIENT_FAILURE; - running = 0; - ev_break(loop, EVBREAK_ALL); + + pgagroal_ev_loop_break(loop); return; server_error: @@ -376,23 +371,23 @@ transaction_client(struct ev_loop* loop, struct ev_io* watcher, int revents) errno = 0; exit_code = WORKER_SERVER_FAILURE; - running = 0; - ev_break(loop, EVBREAK_ALL); + + pgagroal_ev_loop_break(loop); return; failover: exit_code = WORKER_FAILOVER; - running = 0; - ev_break(loop, EVBREAK_ALL); + + pgagroal_ev_loop_break(loop); return; get_error: pgagroal_log_warn("Failure during obtaining connection"); exit_code = WORKER_SERVER_FAILURE; - running = 0; - ev_break(loop, EVBREAK_ALL); + + pgagroal_ev_loop_break(loop); return; } @@ -419,7 +414,14 @@ transaction_server(struct ev_loop* loop, struct ev_io* watcher, int revents) if (wi->server_ssl == NULL) { - status = pgagroal_read_socket_message(wi->server_fd, &msg); + if (config->ev_backend == EV_BACKEND_IO_URING) + { + status = pgagroal_buffer_to_message(watcher->data, watcher->size, 
&msg); + } + else + { + status = pgagroal_read_socket_message(wi->server_fd, &msg); + } } else { @@ -497,7 +499,7 @@ transaction_server(struct ev_loop* loop, struct ev_io* watcher, int revents) { if (has_z && !in_tx && slot != -1) { - ev_io_stop(loop, (struct ev_io*)&server_io); + pgagroal_ev_io_stop(loop, (struct ev_io*)&server_io); if (deallocate) { @@ -518,10 +520,10 @@ transaction_server(struct ev_loop* loop, struct ev_io* watcher, int revents) { if (has_z && !in_tx && slot != -1) { - ev_io_stop(loop, (struct ev_io*)&server_io); + pgagroal_ev_io_stop(loop, (struct ev_io*)&server_io); exit_code = WORKER_SERVER_FATAL; - running = 0; + pgagroal_ev_loop_break(loop); } } } @@ -534,7 +536,7 @@ transaction_server(struct ev_loop* loop, struct ev_io* watcher, int revents) goto server_error; } - ev_break(loop, EVBREAK_ONE); + pgagroal_ev_loop_break(loop); return; client_error: @@ -545,8 +547,8 @@ transaction_server(struct ev_loop* loop, struct ev_io* watcher, int revents) errno = 0; exit_code = WORKER_CLIENT_FAILURE; - running = 0; - ev_break(loop, EVBREAK_ALL); + + pgagroal_ev_loop_break(loop); return; server_done: @@ -555,8 +557,7 @@ transaction_server(struct ev_loop* loop, struct ev_io* watcher, int revents) strerror(errno), wi->server_fd, status); errno = 0; - running = 0; - ev_break(loop, EVBREAK_ALL); + pgagroal_ev_loop_break(loop); return; server_error: @@ -567,16 +568,16 @@ transaction_server(struct ev_loop* loop, struct ev_io* watcher, int revents) errno = 0; exit_code = WORKER_SERVER_FAILURE; - running = 0; - ev_break(loop, EVBREAK_ALL); + + pgagroal_ev_loop_break(loop); return; return_error: pgagroal_log_warn("Failure during connection return"); exit_code = WORKER_SERVER_FAILURE; - running = 0; - ev_break(loop, EVBREAK_ALL); + + pgagroal_ev_loop_break(loop); return; } @@ -584,8 +585,8 @@ static void start_mgt(struct ev_loop* loop) { memset(&io_mgt, 0, sizeof(struct ev_io)); - ev_io_init(&io_mgt, accept_cb, unix_socket, EV_READ); - ev_io_start(loop, &io_mgt); + 
pgagroal_ev_io_accept_init(&io_mgt, unix_socket, accept_cb); + pgagroal_ev_io_start(loop, &io_mgt); } static void @@ -599,7 +600,7 @@ shutdown_mgt(struct ev_loop* loop) memset(&p, 0, sizeof(p)); snprintf(&p[0], sizeof(p), ".s.%d", getpid()); - ev_io_stop(loop, &io_mgt); + pgagroal_ev_io_stop(loop, &io_mgt); pgagroal_disconnect(unix_socket); errno = 0; pgagroal_remove_unix_socket(config->unix_socket_dir, &p[0]); @@ -609,8 +610,6 @@ shutdown_mgt(struct ev_loop* loop) static void accept_cb(struct ev_loop* loop, struct ev_io* watcher, int revents) { - struct sockaddr_in client_addr; - socklen_t client_addr_length; int client_fd = -1; int id = -1; int32_t slot = -1; @@ -626,8 +625,7 @@ accept_cb(struct ev_loop* loop, struct ev_io* watcher, int revents) return; } - client_addr_length = sizeof(client_addr); - client_fd = accept(watcher->fd, (struct sockaddr*)&client_addr, &client_addr_length); + client_fd = watcher->client_fd; if (client_fd == -1) { pgagroal_log_debug("accept: %s (%d)", strerror(errno), watcher->fd); diff --git a/src/libpgagroal/security.c b/src/libpgagroal/security.c index 7ffca53d..82a84b10 100644 --- a/src/libpgagroal/security.c +++ b/src/libpgagroal/security.c @@ -58,6 +58,7 @@ #include #include #include +#include #include static int get_auth_type(struct message* msg, int* auth_type); diff --git a/src/libpgagroal/utils.c b/src/libpgagroal/utils.c index eb688537..b56bbb42 100644 --- a/src/libpgagroal/utils.c +++ b/src/libpgagroal/utils.c @@ -316,10 +316,10 @@ pgagroal_read_int32(void* data) *((unsigned char*)(data + 2)), *((unsigned char*)(data + 3))}; - int32_t res = (int32_t)((bytes[0] << 24)) | - ((bytes[1] << 16)) | - ((bytes[2] << 8)) | - ((bytes[3])); + int32_t res = (int32_t)(((uint32_t)bytes[0] << 24)) | + (((uint32_t)bytes[1] << 16)) | + (((uint32_t)bytes[2] << 8)) | + (((uint32_t)bytes[3])); return res; } @@ -455,161 +455,6 @@ pgagroal_swap(unsigned int i) ((i >> 24) & 0x000000ff); } -void -pgagroal_libev_engines(void) -{ - unsigned int 
engines = ev_supported_backends(); - - if (engines & EVBACKEND_SELECT) - { - pgagroal_log_debug("libev available: select"); - } - if (engines & EVBACKEND_POLL) - { - pgagroal_log_debug("libev available: poll"); - } - if (engines & EVBACKEND_EPOLL) - { - pgagroal_log_debug("libev available: epoll"); - } - if (engines & EVBACKEND_LINUXAIO) - { - pgagroal_log_debug("libev available: linuxaio"); - } - if (engines & EVBACKEND_IOURING) - { - pgagroal_log_debug("libev available: iouring"); - } - if (engines & EVBACKEND_KQUEUE) - { - pgagroal_log_debug("libev available: kqueue"); - } - if (engines & EVBACKEND_DEVPOLL) - { - pgagroal_log_debug("libev available: devpoll"); - } - if (engines & EVBACKEND_PORT) - { - pgagroal_log_debug("libev available: port"); - } -} - -unsigned int -pgagroal_libev(char* engine) -{ - unsigned int engines = ev_supported_backends(); - - if (engine) - { - if (!strcmp("select", engine)) - { - if (engines & EVBACKEND_SELECT) - { - return EVBACKEND_SELECT; - } - else - { - pgagroal_log_warn("libev not available: select"); - } - } - else if (!strcmp("poll", engine)) - { - if (engines & EVBACKEND_POLL) - { - return EVBACKEND_POLL; - } - else - { - pgagroal_log_warn("libev not available: poll"); - } - } - else if (!strcmp("epoll", engine)) - { - if (engines & EVBACKEND_EPOLL) - { - return EVBACKEND_EPOLL; - } - else - { - pgagroal_log_warn("libev not available: epoll"); - } - } - else if (!strcmp("linuxaio", engine)) - { - return EVFLAG_AUTO; - } - else if (!strcmp("iouring", engine)) - { - if (engines & EVBACKEND_IOURING) - { - return EVBACKEND_IOURING; - } - else - { - pgagroal_log_warn("libev not available: iouring"); - } - } - else if (!strcmp("devpoll", engine)) - { - if (engines & EVBACKEND_DEVPOLL) - { - return EVBACKEND_DEVPOLL; - } - else - { - pgagroal_log_warn("libev not available: devpoll"); - } - } - else if (!strcmp("port", engine)) - { - if (engines & EVBACKEND_PORT) - { - return EVBACKEND_PORT; - } - else - { - pgagroal_log_warn("libev 
not available: port"); - } - } - else if (!strcmp("auto", engine) || !strcmp("", engine)) - { - return EVFLAG_AUTO; - } - else - { - pgagroal_log_warn("libev unknown option: %s", engine); - } - } - - return EVFLAG_AUTO; -} - -char* -pgagroal_libev_engine(unsigned int val) -{ - switch (val) - { - case EVBACKEND_SELECT: - return "select"; - case EVBACKEND_POLL: - return "poll"; - case EVBACKEND_EPOLL: - return "epoll"; - case EVBACKEND_LINUXAIO: - return "linuxaio"; - case EVBACKEND_IOURING: - return "iouring"; - case EVBACKEND_KQUEUE: - return "kqueue"; - case EVBACKEND_DEVPOLL: - return "devpoll"; - case EVBACKEND_PORT: - return "port"; - } - - return "Unknown"; -} - char* pgagroal_get_timestamp_string(time_t start_time, time_t end_time, int32_t* seconds) { diff --git a/src/libpgagroal/worker.c b/src/libpgagroal/worker.c index 2749f26f..808c2b9c 100644 --- a/src/libpgagroal/worker.c +++ b/src/libpgagroal/worker.c @@ -29,6 +29,7 @@ /* pgagroal */ #include #include +#include #include #include #include @@ -42,16 +43,15 @@ #include /* system */ -#include +#include #include #include #include #include -volatile int running = 1; volatile int exit_code = WORKER_FAILURE; -static void signal_cb(struct ev_loop* loop, ev_signal* w, int revents); +static void signal_callback(struct ev_loop* loop, ev_signal* w, int revents); void pgagroal_worker(int client_fd, char* address, char** argv) @@ -140,42 +140,46 @@ pgagroal_worker(int client_fd, char* address, char** argv) p = session_pipeline(); } - ev_io_init((struct ev_io*)&client_io, p.client, client_fd, EV_READ); + pgagroal_ev_io_receive_init(&client_io.io, client_fd, p.client); client_io.client_fd = client_fd; client_io.server_fd = config->connections[slot].fd; client_io.slot = slot; client_io.client_ssl = client_ssl; client_io.server_ssl = server_ssl; + client_io.io.ssl = (client_ssl != NULL); if (config->pipeline != PIPELINE_TRANSACTION) { - ev_io_init((struct ev_io*)&server_io, p.server, config->connections[slot].fd, 
EV_READ); + pgagroal_ev_io_receive_init(&server_io.io, config->connections[slot].fd, p.server); server_io.client_fd = client_fd; server_io.server_fd = config->connections[slot].fd; server_io.slot = slot; server_io.client_ssl = client_ssl; server_io.server_ssl = server_ssl; + server_io.io.ssl = (server_ssl != NULL); } - loop = ev_loop_new(pgagroal_libev(config->libev)); + loop = pgagroal_ev_init(); + if (!loop) + { + pgagroal_log_fatal("pgagroal_worker: Unable to create loop"); + exit(1); + } - ev_signal_init((struct ev_signal*)&signal_watcher, signal_cb, SIGQUIT); + pgagroal_ev_signal_init(&signal_watcher.signal, signal_callback, SIGQUIT); signal_watcher.slot = slot; - ev_signal_start(loop, (struct ev_signal*)&signal_watcher); + pgagroal_ev_signal_start(loop, &signal_watcher.signal); p.start(loop, &client_io); started = true; - ev_io_start(loop, (struct ev_io*)&client_io); + pgagroal_ev_io_start(loop, &client_io.io); if (config->pipeline != PIPELINE_TRANSACTION) { - ev_io_start(loop, (struct ev_io*)&server_io); + pgagroal_ev_io_start(loop, &server_io.io); } - while (running) - { - ev_loop(loop, 0); - } + pgagroal_ev_loop(loop); if (config->pipeline == PIPELINE_TRANSACTION) { @@ -294,18 +298,7 @@ pgagroal_worker(int client_fd, char* address, char** argv) pgagroal_pool_status(); pgagroal_log_debug("After client: PID %d Slot %d (%d)", getpid(), slot, exit_code); - if (loop) - { - ev_io_stop(loop, (struct ev_io*)&client_io); - if (config->pipeline != PIPELINE_TRANSACTION) - { - ev_io_stop(loop, (struct ev_io*)&server_io); - } - - ev_signal_stop(loop, (struct ev_signal*)&signal_watcher); - - ev_loop_destroy(loop); - } + pgagroal_ev_loop_destroy(loop); free(address); @@ -318,7 +311,7 @@ pgagroal_worker(int client_fd, char* address, char** argv) } static void -signal_cb(struct ev_loop* loop, ev_signal* w, int revents) +signal_callback(struct ev_loop* loop, ev_signal* w, int revents) { struct signal_info* si; @@ -327,6 +320,6 @@ signal_cb(struct ev_loop* loop, ev_signal* 
w, int revents) pgagroal_log_debug("pgagroal: signal %d for slot %d", si->signal.signum, si->slot); exit_code = WORKER_SHUTDOWN; - running = 0; - ev_break(loop, EVBREAK_ALL); + + pgagroal_ev_loop_break(loop); } diff --git a/src/main.c b/src/main.c index aab3d117..7df02ded 100644 --- a/src/main.c +++ b/src/main.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -50,7 +51,6 @@ #include #include #include -#include #include #include #include @@ -121,10 +121,10 @@ static void start_mgt(void) { memset(&io_mgt, 0, sizeof(struct accept_io)); - ev_io_init((struct ev_io*)&io_mgt, accept_mgt_cb, unix_management_socket, EV_READ); + pgagroal_ev_io_accept_init((struct ev_io*)&io_mgt, unix_management_socket, accept_mgt_cb); io_mgt.socket = unix_management_socket; io_mgt.argv = argv_ptr; - ev_io_start(main_loop, (struct ev_io*)&io_mgt); + pgagroal_ev_io_start(main_loop, (struct ev_io*)&io_mgt); } static void @@ -134,7 +134,6 @@ shutdown_mgt(void) config = (struct main_configuration*)shmem; - ev_io_stop(main_loop, (struct ev_io*)&io_mgt); pgagroal_disconnect(unix_management_socket); errno = 0; pgagroal_remove_unix_socket(config->unix_socket_dir, MAIN_UDS); @@ -145,10 +144,10 @@ static void start_transfer(void) { memset(&io_transfer, 0, sizeof(struct accept_io)); - ev_io_init((struct ev_io*)&io_transfer, accept_transfer_cb, unix_transfer_socket, EV_READ); + pgagroal_ev_io_accept_init((struct ev_io*)&io_transfer, unix_transfer_socket, accept_transfer_cb); io_transfer.socket = unix_transfer_socket; io_transfer.argv = argv_ptr; - ev_io_start(main_loop, (struct ev_io*)&io_transfer); + pgagroal_ev_io_start(main_loop, (struct ev_io*)&io_transfer); } static void @@ -158,7 +157,6 @@ shutdown_transfer(void) config = (struct main_configuration*)shmem; - ev_io_stop(main_loop, (struct ev_io*)&io_transfer); pgagroal_disconnect(unix_transfer_socket); errno = 0; pgagroal_remove_unix_socket(config->unix_socket_dir, TRANSFER_UDS); @@ -169,10 +167,10 @@ static void 
start_uds(void) { memset(&io_uds, 0, sizeof(struct accept_io)); - ev_io_init((struct ev_io*)&io_uds, accept_main_cb, unix_pgsql_socket, EV_READ); + pgagroal_ev_io_accept_init((struct ev_io*)&io_uds, unix_pgsql_socket, accept_main_cb); io_uds.socket = unix_pgsql_socket; io_uds.argv = argv_ptr; - ev_io_start(main_loop, (struct ev_io*)&io_uds); + pgagroal_ev_io_start(main_loop, (struct ev_io*)&io_uds); } static void @@ -186,7 +184,6 @@ shutdown_uds(void) memset(&pgsql, 0, sizeof(pgsql)); snprintf(&pgsql[0], sizeof(pgsql), ".s.PGSQL.%d", config->common.port); - ev_io_stop(main_loop, (struct ev_io*)&io_uds); pgagroal_disconnect(unix_pgsql_socket); errno = 0; pgagroal_remove_unix_socket(config->unix_socket_dir, &pgsql[0]); @@ -201,10 +198,10 @@ start_io(void) int sockfd = *(main_fds + i); memset(&io_main[i], 0, sizeof(struct accept_io)); - ev_io_init((struct ev_io*)&io_main[i], accept_main_cb, sockfd, EV_READ); + pgagroal_ev_io_accept_init((struct ev_io*)&io_main[i], sockfd, accept_main_cb); io_main[i].socket = sockfd; io_main[i].argv = argv_ptr; - ev_io_start(main_loop, (struct ev_io*)&io_main[i]); + pgagroal_ev_io_start(main_loop, (struct ev_io*)&io_main[i]); } } @@ -213,7 +210,6 @@ shutdown_io(void) { for (int i = 0; i < main_fds_length; i++) { - ev_io_stop(main_loop, (struct ev_io*)&io_main[i]); pgagroal_disconnect(io_main[i].socket); errno = 0; } @@ -227,10 +223,10 @@ start_metrics(void) int sockfd = *(metrics_fds + i); memset(&io_metrics[i], 0, sizeof(struct accept_io)); - ev_io_init((struct ev_io*)&io_metrics[i], accept_metrics_cb, sockfd, EV_READ); + pgagroal_ev_io_accept_init((struct ev_io*)&io_metrics[i], sockfd, accept_metrics_cb); io_metrics[i].socket = sockfd; io_metrics[i].argv = argv_ptr; - ev_io_start(main_loop, (struct ev_io*)&io_metrics[i]); + pgagroal_ev_io_start(main_loop, (struct ev_io*)&io_metrics[i]); } } @@ -239,7 +235,6 @@ shutdown_metrics(void) { for (int i = 0; i < metrics_fds_length; i++) { - ev_io_stop(main_loop, (struct 
ev_io*)&io_metrics[i]); pgagroal_disconnect(io_metrics[i].socket); errno = 0; } @@ -253,10 +248,10 @@ start_management(void) int sockfd = *(management_fds + i); memset(&io_management[i], 0, sizeof(struct accept_io)); - ev_io_init((struct ev_io*)&io_management[i], accept_management_cb, sockfd, EV_READ); + pgagroal_ev_io_accept_init((struct ev_io*)&io_management[i], sockfd, accept_management_cb); io_management[i].socket = sockfd; io_management[i].argv = argv_ptr; - ev_io_start(main_loop, (struct ev_io*)&io_management[i]); + pgagroal_ev_io_start(main_loop, (struct ev_io*)&io_management[i]); } } @@ -265,7 +260,6 @@ shutdown_management(void) { for (int i = 0; i < management_fds_length; i++) { - ev_io_stop(main_loop, (struct ev_io*)&io_management[i]); pgagroal_disconnect(io_management[i].socket); errno = 0; } @@ -977,29 +971,23 @@ main(int argc, char** argv) goto error; } - /* libev */ - main_loop = ev_default_loop(pgagroal_libev(config->libev)); + main_loop = pgagroal_ev_init(); if (!main_loop) { - pgagroal_log_fatal("pgagroal: No loop implementation (%x) (%x)", - pgagroal_libev(config->libev), ev_supported_backends()); -#ifdef HAVE_LINUX - sd_notifyf(0, "STATUS=No loop implementation (%x) (%x)", pgagroal_libev(config->libev), ev_supported_backends()); -#endif goto error; } - ev_signal_init((struct ev_signal*)&signal_watcher[0], shutdown_cb, SIGTERM); - ev_signal_init((struct ev_signal*)&signal_watcher[1], reload_cb, SIGHUP); - ev_signal_init((struct ev_signal*)&signal_watcher[2], shutdown_cb, SIGINT); - ev_signal_init((struct ev_signal*)&signal_watcher[3], graceful_cb, SIGTRAP); - ev_signal_init((struct ev_signal*)&signal_watcher[4], coredump_cb, SIGABRT); - ev_signal_init((struct ev_signal*)&signal_watcher[5], shutdown_cb, SIGALRM); + pgagroal_ev_signal_init((struct ev_signal*)&signal_watcher[0], shutdown_cb, SIGTERM); + pgagroal_ev_signal_init((struct ev_signal*)&signal_watcher[1], reload_cb, SIGHUP); + pgagroal_ev_signal_init((struct ev_signal*)&signal_watcher[2], 
shutdown_cb, SIGINT); + pgagroal_ev_signal_init((struct ev_signal*)&signal_watcher[3], graceful_cb, SIGTRAP); + pgagroal_ev_signal_init((struct ev_signal*)&signal_watcher[4], coredump_cb, SIGABRT); + pgagroal_ev_signal_init((struct ev_signal*)&signal_watcher[5], shutdown_cb, SIGALRM); for (int i = 0; i < 6; i++) { signal_watcher[i].slot = -1; - ev_signal_start(main_loop, (struct ev_signal*)&signal_watcher[i]); + pgagroal_ev_signal_start(main_loop, (struct ev_signal*)&signal_watcher[i]); } if (config->pipeline == PIPELINE_PERFORMANCE) @@ -1057,37 +1045,37 @@ main(int argc, char** argv) if (config->idle_timeout > 0) { - ev_periodic_init (&idle_timeout, idle_timeout_cb, 0., - MAX(1. * config->idle_timeout / 2., 5.), 0); - ev_periodic_start (main_loop, &idle_timeout); + pgagroal_ev_periodic_init (&idle_timeout, idle_timeout_cb, + 1000 * MAX(1. * config->idle_timeout / 2., 5.)); + pgagroal_ev_periodic_start (main_loop, &idle_timeout); } if (config->max_connection_age > 0) { - ev_periodic_init (&max_connection_age, max_connection_age_cb, 0., - MAX(1. * config->max_connection_age / 2., 5.), 0); - ev_periodic_start (main_loop, &max_connection_age); + pgagroal_ev_periodic_init (&max_connection_age, max_connection_age_cb, + 1000 * MAX(1. * config->max_connection_age / 2., 5.)); + pgagroal_ev_periodic_start (main_loop, &max_connection_age); } if (config->validation == VALIDATION_BACKGROUND) { - ev_periodic_init (&validation, validation_cb, 0., - MAX(1. * config->background_interval, 5.), 0); - ev_periodic_start (main_loop, &validation); + pgagroal_ev_periodic_init (&validation, validation_cb, + 1000 * MAX(1. * config->background_interval, 5.)); + pgagroal_ev_periodic_start (main_loop, &validation); } if (config->disconnect_client > 0) { - ev_periodic_init (&disconnect_client, disconnect_client_cb, 0., - MIN(300., MAX(1. 
* config->disconnect_client / 2., 1.)), 0); - ev_periodic_start (main_loop, &disconnect_client); + pgagroal_ev_periodic_init (&disconnect_client, disconnect_client_cb, + 1000 * MIN(300., MAX(1. * config->disconnect_client / 2., 1.))); + pgagroal_ev_periodic_start (main_loop, &disconnect_client); } if (config->rotate_frontend_password_timeout > 0) { - ev_periodic_init (&rotate_frontend_password, rotate_frontend_password_cb, 0., - config->rotate_frontend_password_timeout, 0); - ev_periodic_start (main_loop, &rotate_frontend_password); + pgagroal_ev_periodic_init (&rotate_frontend_password, rotate_frontend_password_cb, + 1000 * config->rotate_frontend_password_timeout); + pgagroal_ev_periodic_start (main_loop, &rotate_frontend_password); } if (config->common.metrics > 0) @@ -1157,8 +1145,7 @@ main(int argc, char** argv) { pgagroal_log_debug("Remote management: %d", *(management_fds + i)); } - pgagroal_libev_engines(); - pgagroal_log_debug("libev engine: %s", pgagroal_libev_engine(ev_backend(main_loop))); + pgagroal_log_debug("Pipeline: %d", config->pipeline); pgagroal_log_debug("Pipeline size: %lu", pipeline_shmem_size); pgagroal_log_debug("%s", OpenSSL_version(OPENSSL_VERSION)); @@ -1178,6 +1165,7 @@ main(int argc, char** argv) { if (!fork()) { + pgagroal_ev_fork(main_loop); shutdown_ports(); pgagroal_prefill_if_can(false, true); } @@ -1190,10 +1178,7 @@ main(int argc, char** argv) "MAINPID=%lu", (unsigned long)getpid()); #endif - while (keep_running) - { - ev_loop(main_loop, 0); - } + pgagroal_ev_loop(main_loop); pgagroal_log_info("pgagroal: shutdown"); #ifdef HAVE_LINUX @@ -1206,7 +1191,10 @@ main(int argc, char** argv) struct client* c = clients; while (c != NULL) { - kill(c->pid, SIGQUIT); + if (kill(c->pid, SIGQUIT)) + { + pgagroal_log_debug("kill: %s", strerror(errno)); + } c = c->next; } } @@ -1218,12 +1206,7 @@ main(int argc, char** argv) shutdown_io(); shutdown_uds(); - for (int i = 0; i < 6; i++) - { - ev_signal_stop(main_loop, (struct 
ev_signal*)&signal_watcher[i]); - } - - ev_loop_destroy(main_loop); + pgagroal_ev_loop_destroy(main_loop); free(main_fds); free(metrics_fds); @@ -1251,7 +1234,6 @@ static void accept_main_cb(struct ev_loop* loop, struct ev_io* watcher, int revents) { struct sockaddr_in6 client_addr; - socklen_t client_addr_length; int client_fd; char address[INET6_ADDRSTRLEN]; pid_t pid; @@ -1272,8 +1254,7 @@ accept_main_cb(struct ev_loop* loop, struct ev_io* watcher, int revents) memset(&address, 0, sizeof(address)); - client_addr_length = sizeof(client_addr); - client_fd = accept(watcher->fd, (struct sockaddr*)&client_addr, &client_addr_length); + client_fd = watcher->client_fd; if (client_fd == -1) { if (accept_fatal(errno) && keep_running) @@ -1360,20 +1341,26 @@ accept_main_cb(struct ev_loop* loop, struct ev_io* watcher, int revents) } memcpy(addr, address, strlen(address)); - ev_loop_fork(loop); + /* Prevent SIGINT sent to the parent's controlling terminal + * from irradiating to the children. pgagroal wants the + * children processes to be independent. 
*/ + if (setpgid(0, 0) == -1) + { + pgagroal_log_error("%s: setpgid: %s", __func__, strerror(errno)); + exit(1); + } + + pgagroal_ev_fork(loop); shutdown_ports(); /* We are leaving the socket descriptor valid such that the client won't reuse it */ pgagroal_worker(client_fd, addr, ai->argv); } - pgagroal_disconnect(client_fd); } static void accept_mgt_cb(struct ev_loop* loop, struct ev_io* watcher, int revents) { - struct sockaddr_in6 client_addr; - socklen_t client_addr_length; int client_fd; int32_t id; pid_t pid; @@ -1398,8 +1385,7 @@ accept_mgt_cb(struct ev_loop* loop, struct ev_io* watcher, int revents) errno = 0; - client_addr_length = sizeof(client_addr); - client_fd = accept(watcher->fd, (struct sockaddr*)&client_addr, &client_addr_length); + client_fd = watcher->client_fd; pgagroal_prometheus_self_sockets_add(); @@ -1614,8 +1600,7 @@ accept_mgt_cb(struct ev_loop* loop, struct ev_io* watcher, int revents) pgagroal_management_response_ok(NULL, client_fd, start_time, end_time, compression, encryption, payload); - ev_break(loop, EVBREAK_ALL); - keep_running = 0; + pgagroal_ev_loop_break(loop); } else if (id == MANAGEMENT_CANCEL_SHUTDOWN) { @@ -1839,7 +1824,7 @@ accept_mgt_cb(struct ev_loop* loop, struct ev_io* watcher, int revents) else if (pid == 0) { struct json* pyl = NULL; - + shutdown_ports(); pgagroal_json_clone(payload, &pyl); @@ -1897,8 +1882,8 @@ accept_mgt_cb(struct ev_loop* loop, struct ev_io* watcher, int revents) if (atomic_load(&config->active_connections) == 0) { pgagroal_pool_status(); - keep_running = 0; - ev_break(loop, EVBREAK_ALL); + + pgagroal_ev_loop_break(loop); } } @@ -1924,9 +1909,7 @@ accept_mgt_cb(struct ev_loop* loop, struct ev_io* watcher, int revents) static void accept_transfer_cb(struct ev_loop* loop, struct ev_io* watcher, int revents) { - struct sockaddr_in6 client_addr; - socklen_t client_addr_length; - int client_fd = 0; + int client_fd; int id = -1; pid_t pid = 0; int32_t slot = -1; @@ -1943,8 +1926,7 @@ 
accept_transfer_cb(struct ev_loop* loop, struct ev_io* watcher, int revents) errno = 0; - client_addr_length = sizeof(client_addr); - client_fd = accept(watcher->fd, (struct sockaddr*)&client_addr, &client_addr_length); + client_fd = watcher->client_fd; pgagroal_prometheus_self_sockets_add(); @@ -2110,8 +2092,6 @@ accept_transfer_cb(struct ev_loop* loop, struct ev_io* watcher, int revents) static void accept_metrics_cb(struct ev_loop* loop, struct ev_io* watcher, int revents) { - struct sockaddr_in6 client_addr; - socklen_t client_addr_length; int client_fd; struct main_configuration* config; @@ -2126,8 +2106,7 @@ accept_metrics_cb(struct ev_loop* loop, struct ev_io* watcher, int revents) errno = 0; - client_addr_length = sizeof(client_addr); - client_fd = accept(watcher->fd, (struct sockaddr*)&client_addr, &client_addr_length); + client_fd = watcher->client_fd; pgagroal_prometheus_self_sockets_add(); @@ -2172,7 +2151,7 @@ accept_metrics_cb(struct ev_loop* loop, struct ev_io* watcher, int revents) if (!fork()) { - ev_loop_fork(loop); + pgagroal_ev_fork(loop); shutdown_ports(); /* We are leaving the socket descriptor valid such that the client won't reuse it */ pgagroal_prometheus(client_fd); @@ -2186,7 +2165,6 @@ static void accept_management_cb(struct ev_loop* loop, struct ev_io* watcher, int revents) { struct sockaddr_in6 client_addr; - socklen_t client_addr_length; int client_fd; char address[INET6_ADDRSTRLEN]; struct main_configuration* config; @@ -2197,15 +2175,13 @@ accept_management_cb(struct ev_loop* loop, struct ev_io* watcher, int revents) errno = 0; return; } + errno = 0; memset(&address, 0, sizeof(address)); config = (struct main_configuration*)shmem; - errno = 0; - - client_addr_length = sizeof(client_addr); - client_fd = accept(watcher->fd, (struct sockaddr*)&client_addr, &client_addr_length); + client_fd = watcher->client_fd; pgagroal_prometheus_self_sockets_add(); @@ -2260,7 +2236,7 @@ accept_management_cb(struct ev_loop* loop, struct ev_io* 
watcher, int revents) } memcpy(addr, address, strlen(address)); - ev_loop_fork(loop); + pgagroal_ev_fork(loop); shutdown_ports(); /* We are leaving the socket descriptor valid such that the client won't reuse it */ pgagroal_remote_management(client_fd, addr); @@ -2275,7 +2251,7 @@ shutdown_cb(struct ev_loop* loop, ev_signal* w, int revents) { pgagroal_log_debug("pgagroal: shutdown requested"); pgagroal_pool_status(); - ev_break(loop, EVBREAK_ALL); + pgagroal_ev_loop_break(loop); keep_running = 0; } @@ -2301,8 +2277,8 @@ graceful_cb(struct ev_loop* loop, ev_signal* w, int revents) if (atomic_load(&config->active_connections) == 0) { pgagroal_pool_status(); - keep_running = 0; - ev_break(loop, EVBREAK_ALL); + + pgagroal_ev_loop_break(loop); } } @@ -2326,6 +2302,7 @@ idle_timeout_cb(struct ev_loop* loop, ev_periodic* w, int revents) /* pgagroal_idle_timeout() is always in a fork() */ if (!fork()) { + pgagroal_ev_fork(loop); shutdown_ports(); pgagroal_idle_timeout(); } @@ -2343,6 +2320,7 @@ max_connection_age_cb(struct ev_loop* loop, ev_periodic* w, int revents) /* max_connection_age() is always in a fork() */ if (!fork()) { + pgagroal_ev_fork(loop); shutdown_ports(); pgagroal_max_connection_age(); } @@ -2360,6 +2338,7 @@ validation_cb(struct ev_loop* loop, ev_periodic* w, int revents) /* pgagroal_validation() is always in a fork() */ if (!fork()) { + pgagroal_ev_fork(loop); shutdown_ports(); pgagroal_validation(); } @@ -2377,6 +2356,7 @@ disconnect_client_cb(struct ev_loop* loop, ev_periodic* w, int revents) /* main_pipeline.periodic is always in a fork() */ if (!fork()) { + pgagroal_ev_fork(loop); shutdown_ports(); main_pipeline.periodic(); } diff --git a/src/vault.c b/src/vault.c index 9d60632e..65e5988a 100644 --- a/src/vault.c +++ b/src/vault.c @@ -29,6 +29,7 @@ /* pgagroal */ #include #include +#include #include #include #include @@ -46,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -78,7 +80,6 @@ static int router(SSL* ccl, SSL* 
ssl, int client_fd); static bool is_ssl_request(int client_fd); static int get_connection_state(struct vault_configuration* config, int client_fd); -static volatile int keep_running = 1; static char** argv_ptr; static struct ev_loop* main_loop = NULL; static struct accept_io io_main[MAX_FDS]; @@ -354,10 +355,10 @@ start_vault_io(void) int sockfd = *(server_fds + i); memset(&io_main[i], 0, sizeof(struct accept_io)); - ev_io_init((struct ev_io*)&io_main, accept_vault_cb, sockfd, EV_READ); + pgagroal_ev_io_accept_init((struct ev_io*)&io_main[i], sockfd, accept_vault_cb); io_main[i].socket = sockfd; io_main[i].argv = argv_ptr; - ev_io_start(main_loop, (struct ev_io*)&io_main[i]); + pgagroal_ev_io_start(main_loop, (struct ev_io*)&io_main[i]); } } @@ -366,7 +367,6 @@ shutdown_vault_io(void) { for (int i = 0; i < server_fds_length; i++) { - ev_io_stop(main_loop, (struct ev_io*)&io_main[i]); pgagroal_disconnect(io_main[i].socket); errno = 0; } @@ -380,10 +380,10 @@ start_metrics(void) int sockfd = *(metrics_fds + i); memset(&io_metrics[i], 0, sizeof(struct accept_io)); - ev_io_init((struct ev_io*)&io_metrics[i], accept_metrics_cb, sockfd, EV_READ); + pgagroal_ev_io_accept_init((struct ev_io*)&io_metrics[i], sockfd, accept_metrics_cb); io_metrics[i].socket = sockfd; io_metrics[i].argv = argv_ptr; - ev_io_start(main_loop, (struct ev_io*)&io_metrics[i]); + pgagroal_ev_io_start(main_loop, (struct ev_io*)&io_metrics[i]); } } @@ -392,7 +392,6 @@ shutdown_metrics(void) { for (int i = 0; i < metrics_fds_length; i++) { - ev_io_stop(main_loop, (struct ev_io*)&io_metrics[i]); pgagroal_disconnect(io_metrics[i].socket); errno = 0; } @@ -614,19 +613,18 @@ main(int argc, char** argv) } // -- Initialize the watcher and start loop -- - main_loop = ev_default_loop(0); - + main_loop = pgagroal_ev_init(); if (!main_loop) { errx(1, "pgagroal-vault: No loop implementation"); } - ev_signal_init((struct ev_signal*)&signal_watcher[0], shutdown_cb, SIGTERM); + pgagroal_ev_signal_init((struct 
ev_signal*)&signal_watcher[0], shutdown_cb, SIGTERM); for (int i = 0; i < 1; i++) { signal_watcher[i].slot = -1; - ev_signal_start(main_loop, (struct ev_signal*)&signal_watcher[i]); + pgagroal_ev_signal_start(main_loop, (struct ev_signal*)&signal_watcher[i]); } start_vault_io(); @@ -668,21 +666,13 @@ main(int argc, char** argv) pgagroal_log_debug("Metrics: %d", *(metrics_fds + i)); } - while (keep_running) - { - ev_loop(main_loop, 0); - } + pgagroal_ev_loop(main_loop); pgagroal_log_info("pgagroal-vault: shutdown"); shutdown_ports(); - for (int i = 0; i < 1; i++) - { - ev_signal_stop(main_loop, (struct ev_signal*)&signal_watcher[i]); - } - - ev_loop_destroy(main_loop); + pgagroal_ev_loop_destroy(main_loop); // -- Free all memory -- free(metrics_fds); @@ -697,15 +687,15 @@ static void shutdown_cb(struct ev_loop* loop, ev_signal* w, int revents) { pgagroal_log_debug("pgagroal-vault: Shutdown requested"); - ev_break(loop, EVBREAK_ALL); - keep_running = 0; + pgagroal_ev_loop_break(loop); + } static void accept_vault_cb(struct ev_loop* loop, struct ev_io* watcher, int revents) { struct sockaddr_in6 client_addr; - socklen_t client_addr_length; + int client_fd; char address[INET6_ADDRSTRLEN]; pid_t pid; @@ -724,12 +714,11 @@ accept_vault_cb(struct ev_loop* loop, struct ev_io* watcher, int revents) memset(&address, 0, sizeof(address)); - client_addr_length = sizeof(client_addr); - client_fd = accept(watcher->fd, (struct sockaddr*)&client_addr, &client_addr_length); + client_fd = watcher->client_fd; if (client_fd == -1) { - if (accept_fatal(errno) && keep_running) + if (accept_fatal(errno) && pgagroal_ev_loop_is_running(loop)) { pgagroal_log_warn("accept_vault_cb: Restarting listening port due to: %s (%d)", strerror(errno), watcher->fd); @@ -746,6 +735,7 @@ accept_vault_cb(struct ev_loop* loop, struct ev_io* watcher, int revents) if (!fork()) { + pgagroal_ev_fork(loop); shutdown_ports(); } @@ -781,7 +771,7 @@ accept_vault_cb(struct ev_loop* loop, struct ev_io* watcher, int 
revents) } memcpy(addr, address, strlen(address)); - ev_loop_fork(loop); + pgagroal_ev_fork(loop); shutdown_ports(); // Handle http request @@ -822,7 +812,7 @@ accept_metrics_cb(struct ev_loop* loop, struct ev_io* watcher, int revents) if (client_fd == -1) { - if (accept_fatal(errno) && keep_running) + if (accept_fatal(errno) && pgagroal_ev_loop_is_running(main_loop)) { pgagroal_log_warn("Restarting listening port due to: %s (%d)", strerror(errno), watcher->fd); @@ -861,7 +851,7 @@ accept_metrics_cb(struct ev_loop* loop, struct ev_io* watcher, int revents) if (!fork()) { - ev_loop_fork(loop); + pgagroal_ev_fork(loop); shutdown_ports(); /* We are leaving the socket descriptor valid such that the client won't reuse it */ pgagroal_vault_prometheus(client_fd);