From b1cc0af847baaaf79387ddd25051f9c333203a28 Mon Sep 17 00:00:00 2001 From: Gil Bregman Date: Sun, 22 Dec 2024 11:17:58 +0200 Subject: [PATCH] Do not process gRPC calls when gateway is going down. Fixes #992 Signed-off-by: Gil Bregman --- .env | 2 ++ ceph-nvmeof.conf | 2 +- control/grpc.py | 14 +++++++++++--- control/prometheus.py | 2 +- control/server.py | 2 ++ control/state.py | 2 +- tests/ceph-nvmeof.no-huge.conf | 5 ++++- tests/ceph-nvmeof.tls.conf | 5 ++++- tests/test_cli.py | 8 ++++++++ tests/test_cli_change_lb.py | 4 ++-- 10 files changed, 36 insertions(+), 10 deletions(-) diff --git a/.env b/.env index 968097cd..57b848db 100644 --- a/.env +++ b/.env @@ -101,3 +101,5 @@ DHCHAP_KEY6="DHHC-1:01:Bu4tZd7X2oW7XxmVH5tGCdoS30pDX6bZvexHYoudeVlJW9yz:" DHCHAP_KEY7="DHHC-1:01:JPJkDQ2po2FfLmKYlTF/sJ2HzVO/FKWxgXKE/H6XfL8ogQ1T:" DHCHAP_KEY8="DHHC-1:01:e0B0vDxKleDzYVtG42xqFvoWZfiufkoywmfRKrETzayRdf1j:" DHCHAP_KEY9="DHHC-1:01:KD+sfH3/o2bRQoV0ESjBUywQlMnSaYpZISUbVa0k0nsWpNST:" +DHCHAP_KEY10="DHHC-1:00:rWf0ZFYO7IgWGttM8w6jUrAY4cTQyqyXPdmxHeOSve3w5QU9:" +DHCHAP_KEY11="DHHC-1:02:j3uUz05r5aQy42vX4tDXqVf9HgUPPdEp3kXTgUWl9EphsG7jwpr9KSIt3bmRLXBijPTIDQ==:" diff --git a/ceph-nvmeof.conf b/ceph-nvmeof.conf index ea3c9c08..86875fe3 100644 --- a/ceph-nvmeof.conf +++ b/ceph-nvmeof.conf @@ -32,7 +32,7 @@ max_ns_to_change_lb_grp = 8 #verify_nqns = True #allowed_consecutive_spdk_ping_failures = 1 #spdk_ping_interval_in_seconds = 2.0 -#max_hosts_per_namespace = 1 +#max_hosts_per_namespace = 8 #max_namespaces_with_netmask = 1000 #max_subsystems = 128 #max_namespaces = 1024 diff --git a/control/grpc.py b/control/grpc.py index 4261d2bc..9d277071 100644 --- a/control/grpc.py +++ b/control/grpc.py @@ -362,7 +362,7 @@ def __init__(self, config: GatewayConfig, gateway_state: GatewayStateHandler, rp self.host_name = socket.gethostname() self.verify_nqns = self.config.getboolean_with_default("gateway", "verify_nqns", True) self.gateway_group = self.config.get_with_default("gateway", "group", "") - self.max_hosts_per_namespace = self.config.getint_with_default("gateway", "max_hosts_per_namespace", 1) + self.max_hosts_per_namespace = self.config.getint_with_default("gateway", "max_hosts_per_namespace", 8) self.max_namespaces_with_netmask = self.config.getint_with_default("gateway", "max_namespaces_with_netmask", 1000) self.max_subsystems = self.config.getint_with_default("gateway", "max_subsystems", GatewayService.MAX_SUBSYSTEMS_DEFAULT) self.max_namespaces = self.config.getint_with_default("gateway", "max_namespaces", GatewayService.MAX_NAMESPACES_DEFAULT) @@ -386,6 +386,7 @@ def __init__(self, config: GatewayConfig, gateway_state: GatewayStateHandler, rp self._init_cluster_context() self.subsys_max_ns = {} self.host_info = SubsystemHostAuth() + self.up_and_running = True self.rebalance = Rebalance(self) def get_directories_for_key_file(self, key_type : str, subsysnqn : str, create_dir : bool = False) -> []: @@ -668,6 +669,12 @@ def execute_grpc_function(self, func, request, context): called might take OMAP lock internally, however does NOT ensure taking OMAP lock in any way. 
""" + + if not self.up_and_running: + errmsg = "Gateway is going down" + self.logger.error(errmsg) + return pb2.req_status(status=errno.ESHUTDOWN, error_message=errmsg) + return self.omap_lock.execute_omap_locking_function(self._grpc_function_with_lock, func, request, context) def create_bdev(self, anagrp: int, name, uuid, rbd_pool_name, rbd_image_name, block_size, create_image, rbd_image_size, context, peer_msg = ""): @@ -988,7 +995,7 @@ def create_subsystem_safe(self, request, context): else: subsys_using_serial = self.serial_number_already_used(context, request.serial_number) if subsys_using_serial: - errmsg = f"Serial number {request.serial_number} already used by subsystem {subsys_using_serial}" + errmsg = f"Serial number {request.serial_number} is already used by subsystem {subsys_using_serial}" if subsys_already_exists or subsys_using_serial: errmsg = f"{create_subsystem_error_prefix}: {errmsg}" self.logger.error(f"{errmsg}") @@ -1527,7 +1534,8 @@ def namespace_change_load_balancing_group_safe(self, request, context): grps_list = [] peer_msg = self.get_peer_message(context) change_lb_group_failure_prefix = f"Failure changing load balancing group for namespace with NSID {request.nsid} in {request.subsystem_nqn}" - self.logger.info(f"Received auto {request.auto_lb_logic} request to change load balancing group for namespace with NSID {request.nsid} in {request.subsystem_nqn} to {request.anagrpid}, context: {context}{peer_msg}") + auto_lb_msg = "auto" if request.auto_lb_logic else "manual" + self.logger.info(f"Received {auto_lb_msg} request to change load balancing group for namespace with NSID {request.nsid} in {request.subsystem_nqn} to {request.anagrpid}, context: {context}{peer_msg}") if not request.subsystem_nqn: errmsg = f"Failure changing load balancing group for namespace, missing subsystem NQN" diff --git a/control/prometheus.py b/control/prometheus.py index ec2fa115..411b6057 100644 --- a/control/prometheus.py +++ b/control/prometheus.py @@ -109,7 +109,7 @@ def __init__(self, spdk_rpc_client, config, gateway_rpc): self.gw_config = config _bdev_pools = config.get_with_default('gateway', 'prometheus_bdev_pools', '') self.bdev_pools = _bdev_pools.split(',') if _bdev_pools else [] - self.interval = config.getint_with_default('gateway', 'prometheus_stats_inteval', 10) + self.interval = config.getint_with_default('gateway', 'prometheus_stats_interval', 10) self.lock = threading.Lock() self.hostname = os.getenv('NODE_NAME') or os.getenv('HOSTNAME') diff --git a/control/server.py b/control/server.py index 39c9279e..f1e889a1 100644 --- a/control/server.py +++ b/control/server.py @@ -125,6 +125,8 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback): """Cleans up SPDK and server instances.""" + if self.gateway_rpc: + self.gateway_rpc.up_and_running = False if exc_type is not None: self.logger.exception("GatewayServer exception occurred:") else: diff --git a/control/state.py b/control/state.py index 27e5d523..90837165 100644 --- a/control/state.py +++ b/control/state.py @@ -602,7 +602,7 @@ def cleanup_omap(self, omap_lock = None): if omap_lock and omap_lock.omap_file_lock_duration > 0: try: omap_lock.unlock_omap() - except Exceprion: + except Exception: pass if self.ioctx: try: diff --git a/tests/ceph-nvmeof.no-huge.conf b/tests/ceph-nvmeof.no-huge.conf index 55665105..d5b62aaf 100644 --- a/tests/ceph-nvmeof.no-huge.conf +++ b/tests/ceph-nvmeof.no-huge.conf @@ -17,6 +17,9 @@ state_update_notify = True state_update_timeout_in_msec = 2000 state_update_interval_sec = 5 
enable_spdk_discovery_controller = False +rebalance_period_sec = 7 +max_gws_in_grp = 16 +max_ns_to_change_lb_grp = 8 #omap_file_lock_duration = 20 #omap_file_lock_retries = 30 #omap_file_lock_retry_sleep_interval = 1.0 @@ -29,7 +32,7 @@ enable_spdk_discovery_controller = False #verify_nqns = True #allowed_consecutive_spdk_ping_failures = 1 #spdk_ping_interval_in_seconds = 2.0 -#max_hosts_per_namespace = 1 +#max_hosts_per_namespace = 8 #max_namespaces_with_netmask = 1000 #max_subsystems = 128 #max_namespaces = 1024 diff --git a/tests/ceph-nvmeof.tls.conf b/tests/ceph-nvmeof.tls.conf index 14be0cc3..42334e0b 100644 --- a/tests/ceph-nvmeof.tls.conf +++ b/tests/ceph-nvmeof.tls.conf @@ -16,6 +16,9 @@ enable_auth = True state_update_notify = True state_update_interval_sec = 5 enable_spdk_discovery_controller = False +rebalance_period_sec = 7 +max_gws_in_grp = 16 +max_ns_to_change_lb_grp = 8 #omap_file_lock_duration = 20 #omap_file_lock_retries = 30 #omap_file_lock_retry_sleep_interval = 1.0 @@ -28,7 +31,7 @@ enable_spdk_discovery_controller = False #verify_nqns = True #allowed_consecutive_spdk_ping_failures = 1 #spdk_ping_interval_in_seconds = 2.0 -#max_hosts_per_namespace = 1 +#max_hosts_per_namespace = 8 #max_namespaces_with_netmask = 1000 #max_subsystems = 128 #max_namespaces = 1024 diff --git a/tests/test_cli.py b/tests/test_cli.py index 8559b8d4..e2dfc2d2 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -32,6 +32,7 @@ subsystem5 = "nqn.2016-06.io.spdk:cnode5" subsystem6 = "nqn.2016-06.io.spdk:cnode6" subsystem7 = "nqn.2016-06.io.spdk:cnode7" +subsystem8 = "nqn.2016-06.io.spdk:cnode8" discovery_nqn = "nqn.2014-08.org.nvmexpress.discovery" serial = "Ceph00000000000001" uuid = "948878ee-c3b2-4d58-a29b-2cff713fc02d" @@ -70,6 +71,7 @@ def gateway(config): port = config.getint("gateway", "port") config.config["gateway"]["group"] = group_name config.config["gateway"]["max_namespaces_with_netmask"] = "3" + config.config["gateway"]["max_hosts_per_namespace"] = "1" config.config["gateway"]["max_subsystems"] = "3" config.config["gateway"]["max_namespaces"] = "12" config.config["gateway"]["max_namespaces_per_subsystem"] = "11" @@ -212,6 +214,9 @@ def test_create_subsystem(self, caplog, gateway): assert f'"nqn": "{subsystem}"' in caplog.text assert f'"max_namespaces": 2049' in caplog.text caplog.clear() + cli(["subsystem", "add", "--subsystem", subsystem, "--max-namespaces", "2049", "--no-group-append"]) + assert f"Failure creating subsystem {subsystem}: Subsystem already exists" in caplog.text + caplog.clear() cli(["subsystem", "add", "--subsystem", subsystem2, "--serial-number", serial, "--no-group-append"]) assert f"Adding subsystem {subsystem2}: Successful" in caplog.text caplog.clear() @@ -251,6 +256,9 @@ def test_create_subsystem(self, caplog, gateway): assert subs_list.subsystems[0].nqn == subsystem assert subs_list.subsystems[1].nqn == subsystem2 caplog.clear() + cli(["subsystem", "add", "--subsystem", subsystem8, "--serial-number", serial, "--no-group-append"]) + assert f"Failure creating subsystem {subsystem8}: Serial number {serial} is already used by subsystem {subsystem2}" in caplog.text + caplog.clear() subs_list = cli_test(["subsystem", "list"]) assert subs_list != None assert subs_list.status == 0 diff --git a/tests/test_cli_change_lb.py b/tests/test_cli_change_lb.py index 07cebb43..6e83d179 100755 --- a/tests/test_cli_change_lb.py +++ b/tests/test_cli_change_lb.py @@ -124,10 +124,10 @@ def change_one_namespace_lb_group(caplog, subsys, nsid_to_change, new_group): time.sleep(8) 
assert f"Changing load balancing group of namespace {nsid_to_change} in {subsys} to {new_group}: Successful" in caplog.text - assert f"Received auto False request to change load balancing group for namespace with NSID {nsid_to_change} in {subsys} to {new_group}, context: