From b1cc0af847baaaf79387ddd25051f9c333203a28 Mon Sep 17 00:00:00 2001 From: Gil Bregman Date: Sun, 22 Dec 2024 11:17:58 +0200 Subject: [PATCH] Do not process gRPC calls when gateway is going down. Fixes #992 Signed-off-by: Gil Bregman --- .env | 2 ++ ceph-nvmeof.conf | 2 +- control/grpc.py | 14 +++++++++++--- control/prometheus.py | 2 +- control/server.py | 2 ++ control/state.py | 2 +- tests/ceph-nvmeof.no-huge.conf | 5 ++++- tests/ceph-nvmeof.tls.conf | 5 ++++- tests/test_cli.py | 8 ++++++++ tests/test_cli_change_lb.py | 4 ++-- 10 files changed, 36 insertions(+), 10 deletions(-) diff --git a/.env b/.env index 968097cd..57b848db 100644 --- a/.env +++ b/.env @@ -101,3 +101,5 @@ DHCHAP_KEY6="DHHC-1:01:Bu4tZd7X2oW7XxmVH5tGCdoS30pDX6bZvexHYoudeVlJW9yz:" DHCHAP_KEY7="DHHC-1:01:JPJkDQ2po2FfLmKYlTF/sJ2HzVO/FKWxgXKE/H6XfL8ogQ1T:" DHCHAP_KEY8="DHHC-1:01:e0B0vDxKleDzYVtG42xqFvoWZfiufkoywmfRKrETzayRdf1j:" DHCHAP_KEY9="DHHC-1:01:KD+sfH3/o2bRQoV0ESjBUywQlMnSaYpZISUbVa0k0nsWpNST:" +DHCHAP_KEY10="DHHC-1:00:rWf0ZFYO7IgWGttM8w6jUrAY4cTQyqyXPdmxHeOSve3w5QU9:" +DHCHAP_KEY11="DHHC-1:02:j3uUz05r5aQy42vX4tDXqVf9HgUPPdEp3kXTgUWl9EphsG7jwpr9KSIt3bmRLXBijPTIDQ==:" diff --git a/ceph-nvmeof.conf b/ceph-nvmeof.conf index ea3c9c08..86875fe3 100644 --- a/ceph-nvmeof.conf +++ b/ceph-nvmeof.conf @@ -32,7 +32,7 @@ max_ns_to_change_lb_grp = 8 #verify_nqns = True #allowed_consecutive_spdk_ping_failures = 1 #spdk_ping_interval_in_seconds = 2.0 -#max_hosts_per_namespace = 1 +#max_hosts_per_namespace = 8 #max_namespaces_with_netmask = 1000 #max_subsystems = 128 #max_namespaces = 1024 diff --git a/control/grpc.py b/control/grpc.py index 4261d2bc..9d277071 100644 --- a/control/grpc.py +++ b/control/grpc.py @@ -362,7 +362,7 @@ def __init__(self, config: GatewayConfig, gateway_state: GatewayStateHandler, rp self.host_name = socket.gethostname() self.verify_nqns = self.config.getboolean_with_default("gateway", "verify_nqns", True) self.gateway_group = self.config.get_with_default("gateway", "group", "") - self.max_hosts_per_namespace = self.config.getint_with_default("gateway", "max_hosts_per_namespace", 1) + self.max_hosts_per_namespace = self.config.getint_with_default("gateway", "max_hosts_per_namespace", 8) self.max_namespaces_with_netmask = self.config.getint_with_default("gateway", "max_namespaces_with_netmask", 1000) self.max_subsystems = self.config.getint_with_default("gateway", "max_subsystems", GatewayService.MAX_SUBSYSTEMS_DEFAULT) self.max_namespaces = self.config.getint_with_default("gateway", "max_namespaces", GatewayService.MAX_NAMESPACES_DEFAULT) @@ -386,6 +386,7 @@ def __init__(self, config: GatewayConfig, gateway_state: GatewayStateHandler, rp self._init_cluster_context() self.subsys_max_ns = {} self.host_info = SubsystemHostAuth() + self.up_and_running = True self.rebalance = Rebalance(self) def get_directories_for_key_file(self, key_type : str, subsysnqn : str, create_dir : bool = False) -> []: @@ -668,6 +669,12 @@ def execute_grpc_function(self, func, request, context): called might take OMAP lock internally, however does NOT ensure taking OMAP lock in any way. 
""" + + if not self.up_and_running: + errmsg = "Gateway is going down" + self.logger.error(errmsg) + return pb2.req_status(status=errno.ESHUTDOWN, error_message=errmsg) + return self.omap_lock.execute_omap_locking_function(self._grpc_function_with_lock, func, request, context) def create_bdev(self, anagrp: int, name, uuid, rbd_pool_name, rbd_image_name, block_size, create_image, rbd_image_size, context, peer_msg = ""): @@ -988,7 +995,7 @@ def create_subsystem_safe(self, request, context): else: subsys_using_serial = self.serial_number_already_used(context, request.serial_number) if subsys_using_serial: - errmsg = f"Serial number {request.serial_number} already used by subsystem {subsys_using_serial}" + errmsg = f"Serial number {request.serial_number} is already used by subsystem {subsys_using_serial}" if subsys_already_exists or subsys_using_serial: errmsg = f"{create_subsystem_error_prefix}: {errmsg}" self.logger.error(f"{errmsg}") @@ -1527,7 +1534,8 @@ def namespace_change_load_balancing_group_safe(self, request, context): grps_list = [] peer_msg = self.get_peer_message(context) change_lb_group_failure_prefix = f"Failure changing load balancing group for namespace with NSID {request.nsid} in {request.subsystem_nqn}" - self.logger.info(f"Received auto {request.auto_lb_logic} request to change load balancing group for namespace with NSID {request.nsid} in {request.subsystem_nqn} to {request.anagrpid}, context: {context}{peer_msg}") + auto_lb_msg = "auto" if request.auto_lb_logic else "manual" + self.logger.info(f"Received {auto_lb_msg} request to change load balancing group for namespace with NSID {request.nsid} in {request.subsystem_nqn} to {request.anagrpid}, context: {context}{peer_msg}") if not request.subsystem_nqn: errmsg = f"Failure changing load balancing group for namespace, missing subsystem NQN" diff --git a/control/prometheus.py b/control/prometheus.py index ec2fa115..411b6057 100644 --- a/control/prometheus.py +++ b/control/prometheus.py @@ -109,7 +109,7 @@ def __init__(self, spdk_rpc_client, config, gateway_rpc): self.gw_config = config _bdev_pools = config.get_with_default('gateway', 'prometheus_bdev_pools', '') self.bdev_pools = _bdev_pools.split(',') if _bdev_pools else [] - self.interval = config.getint_with_default('gateway', 'prometheus_stats_inteval', 10) + self.interval = config.getint_with_default('gateway', 'prometheus_stats_interval', 10) self.lock = threading.Lock() self.hostname = os.getenv('NODE_NAME') or os.getenv('HOSTNAME') diff --git a/control/server.py b/control/server.py index 39c9279e..f1e889a1 100644 --- a/control/server.py +++ b/control/server.py @@ -125,6 +125,8 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback): """Cleans up SPDK and server instances.""" + if self.gateway_rpc: + self.gateway_rpc.up_and_running = False if exc_type is not None: self.logger.exception("GatewayServer exception occurred:") else: diff --git a/control/state.py b/control/state.py index 27e5d523..90837165 100644 --- a/control/state.py +++ b/control/state.py @@ -602,7 +602,7 @@ def cleanup_omap(self, omap_lock = None): if omap_lock and omap_lock.omap_file_lock_duration > 0: try: omap_lock.unlock_omap() - except Exceprion: + except Exception: pass if self.ioctx: try: diff --git a/tests/ceph-nvmeof.no-huge.conf b/tests/ceph-nvmeof.no-huge.conf index 55665105..d5b62aaf 100644 --- a/tests/ceph-nvmeof.no-huge.conf +++ b/tests/ceph-nvmeof.no-huge.conf @@ -17,6 +17,9 @@ state_update_notify = True state_update_timeout_in_msec = 2000 state_update_interval_sec = 5 
enable_spdk_discovery_controller = False +rebalance_period_sec = 7 +max_gws_in_grp = 16 +max_ns_to_change_lb_grp = 8 #omap_file_lock_duration = 20 #omap_file_lock_retries = 30 #omap_file_lock_retry_sleep_interval = 1.0 @@ -29,7 +32,7 @@ enable_spdk_discovery_controller = False #verify_nqns = True #allowed_consecutive_spdk_ping_failures = 1 #spdk_ping_interval_in_seconds = 2.0 -#max_hosts_per_namespace = 1 +#max_hosts_per_namespace = 8 #max_namespaces_with_netmask = 1000 #max_subsystems = 128 #max_namespaces = 1024 diff --git a/tests/ceph-nvmeof.tls.conf b/tests/ceph-nvmeof.tls.conf index 14be0cc3..42334e0b 100644 --- a/tests/ceph-nvmeof.tls.conf +++ b/tests/ceph-nvmeof.tls.conf @@ -16,6 +16,9 @@ enable_auth = True state_update_notify = True state_update_interval_sec = 5 enable_spdk_discovery_controller = False +rebalance_period_sec = 7 +max_gws_in_grp = 16 +max_ns_to_change_lb_grp = 8 #omap_file_lock_duration = 20 #omap_file_lock_retries = 30 #omap_file_lock_retry_sleep_interval = 1.0 @@ -28,7 +31,7 @@ enable_spdk_discovery_controller = False #verify_nqns = True #allowed_consecutive_spdk_ping_failures = 1 #spdk_ping_interval_in_seconds = 2.0 -#max_hosts_per_namespace = 1 +#max_hosts_per_namespace = 8 #max_namespaces_with_netmask = 1000 #max_subsystems = 128 #max_namespaces = 1024 diff --git a/tests/test_cli.py b/tests/test_cli.py index 8559b8d4..e2dfc2d2 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -32,6 +32,7 @@ subsystem5 = "nqn.2016-06.io.spdk:cnode5" subsystem6 = "nqn.2016-06.io.spdk:cnode6" subsystem7 = "nqn.2016-06.io.spdk:cnode7" +subsystem8 = "nqn.2016-06.io.spdk:cnode8" discovery_nqn = "nqn.2014-08.org.nvmexpress.discovery" serial = "Ceph00000000000001" uuid = "948878ee-c3b2-4d58-a29b-2cff713fc02d" @@ -70,6 +71,7 @@ def gateway(config): port = config.getint("gateway", "port") config.config["gateway"]["group"] = group_name config.config["gateway"]["max_namespaces_with_netmask"] = "3" + config.config["gateway"]["max_hosts_per_namespace"] = "1" config.config["gateway"]["max_subsystems"] = "3" config.config["gateway"]["max_namespaces"] = "12" config.config["gateway"]["max_namespaces_per_subsystem"] = "11" @@ -212,6 +214,9 @@ def test_create_subsystem(self, caplog, gateway): assert f'"nqn": "{subsystem}"' in caplog.text assert f'"max_namespaces": 2049' in caplog.text caplog.clear() + cli(["subsystem", "add", "--subsystem", subsystem, "--max-namespaces", "2049", "--no-group-append"]) + assert f"Failure creating subsystem {subsystem}: Subsystem already exists" in caplog.text + caplog.clear() cli(["subsystem", "add", "--subsystem", subsystem2, "--serial-number", serial, "--no-group-append"]) assert f"Adding subsystem {subsystem2}: Successful" in caplog.text caplog.clear() @@ -251,6 +256,9 @@ def test_create_subsystem(self, caplog, gateway): assert subs_list.subsystems[0].nqn == subsystem assert subs_list.subsystems[1].nqn == subsystem2 caplog.clear() + cli(["subsystem", "add", "--subsystem", subsystem8, "--serial-number", serial, "--no-group-append"]) + assert f"Failure creating subsystem {subsystem8}: Serial number {serial} is already used by subsystem {subsystem2}" in caplog.text + caplog.clear() subs_list = cli_test(["subsystem", "list"]) assert subs_list != None assert subs_list.status == 0 diff --git a/tests/test_cli_change_lb.py b/tests/test_cli_change_lb.py index 07cebb43..6e83d179 100755 --- a/tests/test_cli_change_lb.py +++ b/tests/test_cli_change_lb.py @@ -124,10 +124,10 @@ def change_one_namespace_lb_group(caplog, subsys, nsid_to_change, new_group): time.sleep(8) 
assert f"Changing load balancing group of namespace {nsid_to_change} in {subsys} to {new_group}: Successful" in caplog.text - assert f"Received auto False request to change load balancing group for namespace with NSID {nsid_to_change} in {subsys} to {new_group}, context: