nasa-cryo: k8s 1.22 to 1.25, node sharing setup #2374

Merged: 4 commits, Mar 20, 2023
223 changes: 174 additions & 49 deletions config/clusters/nasa-cryo/common.values.yaml
@@ -56,22 +56,6 @@ basehub:
- fperez
JupyterHub:
authenticator_class: github
# Announcement is a JupyterHub feature to present messages to users in
# web pages under the /hub path (JupyterHub responds), but not via the
# /user path (single-user server responds).
#
# ref: https://github.com/2i2c-org/infrastructure/issues/1501
# ref: https://jupyterhub.readthedocs.io/en/stable/reference/templates.html#announcement-configuration-variables
#
template_vars:
announcement: >-
<strong>
Service maintenance is scheduled Sunday March 19, to Monday 8AM
EST.
</strong>
<br/>
Running servers may be forcefully stopped and service disruption
is expected.
GitHubOAuthenticator:
Contributor:
Do we need to leave this commented in? I'd suggest adding an example to the docs, maybe under SRE Guide > Support Tasks or a how-to on setting maintenance announcements, rather than leaving commented-out code lying around.

Member Author:
Removed in 1ea720f.

Makes sense to document this, I'm opening an issue about it.

Member Author:
Opened #2379!
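
For reference, a minimal sketch of what such a documentation example could show at the JupyterHub level. The removed template_vars block above maps onto the c.JupyterHub.template_vars setting described in the linked templates documentation; the config-file form below is illustrative only, since on these hubs the value is set through helm values rather than a hand-edited jupyterhub_config.py.

# jupyterhub_config.py (illustrative only)
c = get_config()  # noqa: provided by JupyterHub when it loads this file

c.JupyterHub.template_vars = {
    "announcement": (
        "<strong>Service maintenance is scheduled Sunday March 19 "
        "to Monday 8AM EST.</strong><br/>"
        "Running servers may be forcefully stopped and service "
        "disruption is expected."
    ),
}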

# We are restricting profiles based on GitHub Team membership and
# so need to populate the teams in the auth state
@@ -95,53 +79,194 @@ basehub:
subPath: _shared
readOnly: true
profileList:
# The mem-guarantees are here so k8s doesn't schedule other pods
# on these nodes.
- display_name: "Small: m5.large"
description: "~2 CPU, ~8G RAM"
# NOTE: About node sharing
#
# CPU/Memory requests/limits are still being actively considered. This
# profile list is set up to involve node sharing as considered in
# https://github.com/2i2c-org/infrastructure/issues/2121.
#
# - Memory requests differ from the description: they are based on what
#   is found to remain allocatable in k8s, minus 1 GiB of overhead for
#   misc system pods, and on transitioning from GB in the description
#   to GiB in mem_guarantee.
# - CPU requests are set to 10% of the CPU count in the description.
#
# A sketch of this derivation is included after this file's diff.
#
- display_name: "Small: up to 4 CPU / 32 GB RAM"
description: &profile_list_description "Start a container with at least a chosen share of capacity on a node of this type"
slug: small
default: true
allowed_teams:
- 2i2c-org:hub-access-for-2i2c-staff
- CryoInTheCloud:cryoclouduser
- CryoInTheCloud:cryocloudadvanced
profile_options:
requests:
# NOTE: Node share choices are in active development, see comment
# next to profileList: above.
display_name: Node share
choices:
mem_1:
default: true
display_name: ~1 GB, ~0.125 CPU
kubespawner_override:
mem_guarantee: 0.904G
cpu_guarantee: 0.013
mem_2:
display_name: ~2 GB, ~0.25 CPU
kubespawner_override:
mem_guarantee: 1.809G
cpu_guarantee: 0.025
mem_4:
display_name: ~4 GB, ~0.5 CPU
kubespawner_override:
mem_guarantee: 3.617G
cpu_guarantee: 0.05
mem_8:
display_name: ~8 GB, ~1.0 CPU
kubespawner_override:
mem_guarantee: 7.234G
cpu_guarantee: 0.1
mem_16:
display_name: ~16 GB, ~2.0 CPU
kubespawner_override:
mem_guarantee: 14.469G
cpu_guarantee: 0.2
mem_32:
display_name: ~32 GB, ~4.0 CPU
kubespawner_override:
mem_guarantee: 28.937G
cpu_guarantee: 0.4
kubespawner_override:
# Explicitly unset mem_limit, so it overrides the default memory limit we set in
# basehub/values.yaml
cpu_limit: null
mem_limit: null
mem_guarantee: 6.5G
node_selector:
node.kubernetes.io/instance-type: m5.large
- display_name: "Medium: m5.xlarge"
description: "~4 CPU, ~15G RAM"
allowed_teams:
- 2i2c-org:hub-access-for-2i2c-staff
- CryoInTheCloud:cryoclouduser
- CryoInTheCloud:cryocloudadvanced
kubespawner_override:
mem_limit: null
mem_guarantee: 12G
node_selector:
node.kubernetes.io/instance-type: m5.xlarge
- display_name: "Large: m5.2xlarge"
description: "~8 CPU, ~30G RAM"
allowed_teams:
- 2i2c-org:hub-access-for-2i2c-staff
- CryoInTheCloud:cryocloudadvanced
kubespawner_override:
mem_limit: null
mem_guarantee: 26G
node_selector:
node.kubernetes.io/instance-type: m5.2xlarge
- display_name: "Huge: m5.8xlarge"
description: "~32 CPU, ~128G RAM"
node.kubernetes.io/instance-type: r5.xlarge

- display_name: "Medium: up to 16 CPU / 128 GB RAM"
description: *profile_list_description
slug: medium
allowed_teams:
- 2i2c-org:hub-access-for-2i2c-staff
- CryoInTheCloud:cryocloudadvanced
profile_options:
requests:
# NOTE: Node share choices are in active development, see comment
# next to profileList: above.
display_name: Node share
choices:
mem_1:
display_name: ~1 GB, ~0.125 CPU
kubespawner_override:
mem_guarantee: 0.942G
cpu_guarantee: 0.013
mem_2:
display_name: ~2 GB, ~0.25 CPU
kubespawner_override:
mem_guarantee: 1.883G
cpu_guarantee: 0.025
mem_4:
default: true
display_name: ~4 GB, ~0.5 CPU
kubespawner_override:
mem_guarantee: 3.766G
cpu_guarantee: 0.05
mem_8:
display_name: ~8 GB, ~1.0 CPU
kubespawner_override:
mem_guarantee: 7.532G
cpu_guarantee: 0.1
mem_16:
display_name: ~16 GB, ~2.0 CPU
kubespawner_override:
mem_guarantee: 15.064G
cpu_guarantee: 0.2
mem_32:
display_name: ~32 GB, ~4.0 CPU
kubespawner_override:
mem_guarantee: 30.128G
cpu_guarantee: 0.4
mem_64:
display_name: ~64 GB, ~8.0 CPU
kubespawner_override:
mem_guarantee: 60.257G
cpu_guarantee: 0.8
mem_128:
display_name: ~128 GB, ~16.0 CPU
kubespawner_override:
mem_guarantee: 120.513G
cpu_guarantee: 1.6
kubespawner_override:
cpu_limit: null
mem_limit: null
mem_guarantee: 115G
node_selector:
node.kubernetes.io/instance-type: m5.8xlarge
node.kubernetes.io/instance-type: r5.4xlarge

# NOTE: The large option is kept as a comment for now. It may become
# relevant in the future for advanced users running a workshop, and
# can then be enabled more easily.
#
# This setup was discussed with Tasha Snow in March 2023 at
# https://2i2c.freshdesk.com/a/tickets/543.
#
# - display_name: "Large: up to 64 CPU / 512 GB RAM"
# description: *profile_list_description
# slug: large
pnasrat marked this conversation as resolved.
# allowed_teams:
# - 2i2c-org:hub-access-for-2i2c-staff
# - CryoInTheCloud:cryocloudadvanced
# profile_options:
# requests:
# # NOTE: Node share choices are in active development, see comment
# # next to profileList: above.
# display_name: Node share
# choices:
# mem_4:
# display_name: ~4 GB, ~0.5 CPU
# kubespawner_override:
# mem_guarantee: 3.821G
# cpu_guarantee: 0.05
# mem_8:
# display_name: ~8 GB, ~1.0 CPU
# kubespawner_override:
# mem_guarantee: 7.643G
# cpu_guarantee: 0.1
# mem_16:
# default: true
# display_name: ~16 GB, ~2.0 CPU
# kubespawner_override:
# mem_guarantee: 15.285G
# cpu_guarantee: 0.2
# mem_32:
# display_name: ~32 GB, ~4.0 CPU
# kubespawner_override:
# mem_guarantee: 30.571G
# cpu_guarantee: 0.4
# mem_64:
# display_name: ~64 GB, ~8.0 CPU
# kubespawner_override:
# mem_guarantee: 61.141G
# cpu_guarantee: 0.8
# mem_128:
# display_name: ~128 GB, ~16.0 CPU
# kubespawner_override:
# mem_guarantee: 122.282G
# cpu_guarantee: 1.6
# mem_256:
# display_name: ~256 GB, ~32.0 CPU
# kubespawner_override:
# mem_guarantee: 244.565G
# cpu_guarantee: 3.2
# mem_512:
# display_name: ~512 GB, ~64.0 CPU
# kubespawner_override:
# mem_guarantee: 489.13G
# cpu_guarantee: 6.4
# kubespawner_override:
# cpu_limit: null
# mem_limit: null
# node_selector:
# node.kubernetes.io/instance-type: r5.16xlarge
scheduling:
userScheduler:
enabled: true
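
To make the node-share arithmetic in the profileList above easier to follow, here is a minimal Python sketch of how guarantees like those could be derived. The allocatable-memory figure and the 1 GiB overhead are assumptions for illustration; the values committed above were measured against the actual cluster, so they will not reproduce exactly from nominal instance specs.

# Sketch: derive per-choice guarantees for a node-sharing profile.
# Assumption: allocatable memory is whatever k8s reports for the node,
# minus roughly 1 GiB reserved for misc system pods.

def node_share_choices(allocatable_gib, node_cpu, n_shares):
    """Yield (share_label, mem_guarantee_G, cpu_guarantee), largest share first."""
    usable_bytes = (allocatable_gib - 1) * 2**30      # subtract ~1 GiB overhead
    for i in range(n_shares):
        fraction = 1 / 2**i                           # full node, 1/2, 1/4, ...
        mem_g = usable_bytes * fraction / 1e9         # kubespawner "G" means 10**9 bytes
        cpu = node_cpu * fraction * 0.10              # guarantee 10% of the share's CPUs
        yield f"1/{2**i} node", round(mem_g, 3), round(cpu, 3)

# Example: a hypothetical 4-CPU node with ~28 GiB allocatable
for label, mem_g, cpu in node_share_choices(28, 4, 6):
    print(f"{label}: mem_guarantee={mem_g}G cpu_guarantee={cpu}")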
20 changes: 7 additions & 13 deletions eksctl/nasa-cryo.jsonnet
@@ -25,10 +25,9 @@ local nodeAz = "us-west-2a";
// A `node.kubernetes.io/instance-type` label is added, so pods
// can request a particular kind of node with a nodeSelector
local notebookNodes = [
{ instanceType: "m5.large" },
{ instanceType: "m5.xlarge" },
{ instanceType: "m5.2xlarge" },
{ instanceType: "m5.8xlarge" },
{ instanceType: "r5.xlarge" },
{ instanceType: "r5.4xlarge" },
{ instanceType: "r5.16xlarge" },
];

local daskNodes = [
@@ -38,10 +37,7 @@ local daskNodes = [
// *first* item in instanceDistribution.instanceTypes, to match
// what we do with notebook nodes. Pods can request a particular
// kind of node with a nodeSelector
{ instancesDistribution+: { instanceTypes: ["m5.large"] }},
{ instancesDistribution+: { instanceTypes: ["m5.xlarge"] }},
{ instancesDistribution+: { instanceTypes: ["m5.2xlarge"] }},
{ instancesDistribution+: { instanceTypes: ["m5.8xlarge"] }},
{ instancesDistribution+: { instanceTypes: ["r5.4xlarge"] }},
];


Expand All @@ -51,7 +47,7 @@ local daskNodes = [
metadata+: {
name: "nasa-cryo",
region: clusterRegion,
version: '1.22'
version: '1.25'
},
availabilityZones: masterAzs,
iam: {
@@ -83,7 +79,7 @@ local daskNodes = [
ssh: {
publicKeyPath: 'ssh-keys/nasa-cryo.key.pub'
},
instanceType: "m5.xlarge",
instanceType: "r5.xlarge",
minSize: 1,
maxSize: 6,
labels+: {
@@ -138,6 +134,4 @@ local daskNodes = [
},
} + n for n in daskNodes
]


}
}