From b4c56fc8f8b95936d5aed87f7ebbc1c90ca66c93 Mon Sep 17 00:00:00 2001
From: Andrei Strelkovskii
Date: Sat, 24 Aug 2024 15:20:42 +0300
Subject: [PATCH] io_setup may return EAGAIN - it should be retried (for a reasonable time interval) (#1834)

* io_setup may return EAGAIN - it should be retried (for a reasonable time interval)

* increasing retry count for io_setup EAGAIN

* io_setup EAGAIN retries - more logs

* io_setup EAGAIN retries - even more logs
---
 cloud/storage/core/libs/aio/service.cpp    | 31 ++++++++++++++++++----
 cloud/storage/core/libs/aio/service_ut.cpp | 27 +++++++++++++++++++
 2 files changed, 53 insertions(+), 5 deletions(-)

diff --git a/cloud/storage/core/libs/aio/service.cpp b/cloud/storage/core/libs/aio/service.cpp
index 89b0cc72b9..5bb1b50c2e 100644
--- a/cloud/storage/core/libs/aio/service.cpp
+++ b/cloud/storage/core/libs/aio/service.cpp
@@ -4,6 +4,7 @@
 
 #include
 #include
+#include
 #include
 #include
 #include
@@ -45,12 +46,32 @@ class TAsyncIOContext
     io_context* Context = nullptr;
 
 public:
-    explicit TAsyncIOContext(size_t nr)
+    explicit TAsyncIOContext(int nr)
     {
-        int ret = io_setup(nr, &Context);
-        Y_ABORT_UNLESS(ret == 0,
-            "unable to initialize context: %s",
-            LastSystemErrorText(-ret));
+        int code = 0;
+        int iterations = 0;
+        const int maxIterations = 1000;
+        const auto waitTime = TDuration::MilliSeconds(100);
+        while (iterations < maxIterations) {
+            ++iterations;
+            code = io_setup(nr, &Context);
+            if (code == -EAGAIN) {
+                const auto aioNr =
+                    TIFStream("/proc/sys/fs/aio-nr").ReadLine();
+                const auto aioMaxNr =
+                    TIFStream("/proc/sys/fs/aio-max-nr").ReadLine();
+                Cerr << "retrying EAGAIN from io_setup, aio-nr/max: "
+                    << aioNr << "/" << aioMaxNr << Endl;
+                Sleep(waitTime);
+            } else {
+                break;
+            }
+        }
+
+        Y_ABORT_UNLESS(code == 0,
+            "unable to initialize context: %s, iterations: %d",
+            LastSystemErrorText(-code),
+            iterations);
     }
 
     ~TAsyncIOContext()
diff --git a/cloud/storage/core/libs/aio/service_ut.cpp b/cloud/storage/core/libs/aio/service_ut.cpp
index 99b5e694dc..2c56aae08c 100644
--- a/cloud/storage/core/libs/aio/service_ut.cpp
+++ b/cloud/storage/core/libs/aio/service_ut.cpp
@@ -12,7 +12,9 @@
 #include
 #include
 #include
+#include
 #include
+#include
 
 namespace NCloud {
 
@@ -81,6 +83,31 @@ Y_UNIT_TEST_SUITE(TAioTest)
             UNIT_ASSERT_VALUES_EQUAL('X', val);
         }
     }
+
+    Y_UNIT_TEST(ShouldRetryIoSetupErrors)
+    {
+        const auto eventCountLimit =
+            FromString(TIFStream("/proc/sys/fs/aio-max-nr").ReadLine());
+        const auto service1EventCount = eventCountLimit / 2;
+        auto service1 = CreateAIOService(service1EventCount);
+        auto promise1 = NThreading::NewPromise();
+        auto promise2 = NThreading::NewPromise();
+        SystemThreadFactory()->Run([&] () {
+            promise1.SetValue();
+
+            const auto service2EventCount =
+                eventCountLimit - service1EventCount + 1;
+            // should cause EAGAIN from io_setup until service1 is destroyed
+            auto service2 = CreateAIOService(service2EventCount);
+            Y_UNUSED(service2);
+            promise2.SetValue();
+        });
+
+        promise1.GetFuture().GetValueSync();
+        Sleep(TDuration::Seconds(1));
+        service1.reset();
+        promise2.GetFuture().GetValue(TDuration::Seconds(5));
+    }
 }
 
 } // namespace NCloud