Merge pull request #2474 from yuvipanda/jmte-hub
Integrate JMTE hub into our existing infrastructure
yuvipanda authored Apr 11, 2023
2 parents 3e1460f + 3a4ea7e commit 21f6d35
Showing 14 changed files with 776 additions and 2 deletions.
27 changes: 27 additions & 0 deletions config/clusters/jupyter-meets-the-earth/cluster.yaml
@@ -0,0 +1,27 @@
name: jupyter-meets-the-earth
provider: aws # https://286354552638.signin.aws.amazon.com/console
aws:
  key: enc-deployer-credentials.secret.json
  clusterType: eks
  clusterName: jupyter-meets-the-earth
  region: us-west-2
support:
  helm_chart_values_files:
    - support.values.yaml
    - enc-support.secret.values.yaml
hubs:
  - name: staging
    domain: staging.jmte.2i2c.cloud
    helm_chart: daskhub
    helm_chart_values_files:
      - common.values.yaml
      - staging.values.yaml
      - enc-staging.secret.values.yaml
  - name: prod
    display_name: "Jupyter Meets the Earth"
    domain: jmte.2i2c.cloud
    helm_chart: daskhub
    helm_chart_values_files:
      - common.values.yaml
      - prod.values.yaml
      - enc-prod.secret.values.yaml
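
For orientation, a minimal sketch of reading a cluster.yaml like the one above with PyYAML and listing its hubs; this is a hypothetical helper, not 2i2c's actual deployer code:

import yaml  # PyYAML

with open("config/clusters/jupyter-meets-the-earth/cluster.yaml") as f:
    cluster = yaml.safe_load(f)

print(cluster["name"], cluster["provider"], cluster["aws"]["region"])
for hub in cluster["hubs"]:
    # Each hub layers common.values.yaml first, then hub-specific and secret values.
    print(hub["name"], hub["helm_chart"], hub["helm_chart_values_files"])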
319 changes: 319 additions & 0 deletions config/clusters/jupyter-meets-the-earth/common.values.yaml
@@ -0,0 +1,319 @@
basehub:
  nfs:
    pv:
      # from https://docs.aws.amazon.com/efs/latest/ug/mounting-fs-nfs-mount-settings.html
      mountOptions:
        - rsize=1048576
        - wsize=1048576
        - timeo=600
        - soft # We pick soft over hard, so NFS lockups don't lead to hung processes
        - retrans=2
        - noresvport
      serverIP: fs-01707b06.efs.us-west-2.amazonaws.com
      # This is different from the rest of our hubs!
      baseShareName: /

  jupyterhub:
    custom:
      homepage:
        templateVars:
          org:
            name: Jupyter meets the Earth
            logo_url: https://pangeo-data.github.io/jupyter-earth/_static/jupyter-earth.png
            url: https://jupytearth.org
          designed_by:
            name: 2i2c
            url: https://2i2c.org
          operated_by:
            name: 2i2c
            url: https://2i2c.org
          funded_by:
            name: Jupyter meets the Earth
            url: https://jupytearth.org

    singleuser:
      extraFiles:
        jupyter_server_config.json:
          mountPath: /etc/jupyter/jupyter_notebook_config.json
          data:
            # Allow the JupyterLab option to show hidden files in the file browser
            # https://github.com/berkeley-dsep-infra/datahub/issues/3160
            ContentsManager:
              allow_hidden: true
      initContainers:
        # We need to set this up explicitly, copying what's in basehub/values.yaml,
        # because we have an extra 'shared-public' directory here.
        - name: volume-mount-ownership-fix
          image: busybox
          command:
            [
              "sh",
              "-c",
              "id && chown 1000:1000 /home/jovyan /home/jovyan/shared /home/jovyan/shared-public && ls -lhd /home/jovyan",
            ]
          securityContext:
            runAsUser: 0
          volumeMounts:
            - name: home
              mountPath: /home/jovyan
              subPath: "{username}"
            - name: home
              mountPath: /home/jovyan/shared
              subPath: _shared
            - name: home
              mountPath: /home/jovyan/shared-public
              subPath: _shared_public

      # /dev/shm is mounted as a filesystem path, where writing to it means
      # writing to memory.
      #
      # How to: https://stackoverflow.com/questions/46085748/define-size-for-dev-shm-on-container-engine/46434614#46434614
      # Request for this by Ellie: https://fperezgroup.slack.com/archives/C020XCEFPEH/p1658168872788389
      storage:
        extraVolumes:
          - name: dev-shm
            emptyDir:
              medium: Memory
        extraVolumeMounts:
          - name: dev-shm
            mountPath: /dev/shm
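            # An emptyDir with medium: Memory is tmpfs-backed, so anything
            # written to /dev/shm counts against the pod's memory limit rather
            # than against node disk.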
          # FIXME: we override the extraVolumeMounts list that is also set in
          #        the basehub chart, so we need to add these mounts here as
          #        well. An option is to add hub.extraConfig entries that
          #        append these extra volume mounts to the kubespawner
          #        configuration.
          #
          - name: home
            mountPath: /home/jovyan/shared
            subPath: _shared
            readOnly: true
          - name: home
            mountPath: /home/jovyan/shared-public
            subPath: _shared_public

      # Increased because we have experienced overly slow image pulls at least
      # once. Our pods seem to take ~6-7 minutes to start on a new node, so
      # this gives us some margin.
      startTimeout: 1200

      extraEnv:
        GH_SCOPED_CREDS_APP_URL: https://github.com/apps/hub-jupytearth-org-github-integ
        GH_SCOPED_CREDS_CLIENT_ID: Iv1.a073b1649637af12

        # FIXME: Until we can set this just for the GPU nodes, we need to set it for everyone
        NVIDIA_DRIVER_CAPABILITIES: compute,utility

      image:
        # NOTE: We use the jupyterhub-configurator so this image/tag is not
        # relevant. Visit its UI to configure the hub.
        #
        # staging: https://staging.hub.jupytearth.org/services/configurator/
        # prod: https://hub.jupytearth.org/services/configurator/
        pullPolicy: Always
        name: 286354552638.dkr.ecr.us-west-2.amazonaws.com/jmte/user-env
        tag: "latest"

      profileList:
        - display_name: "16th of Medium: 0.25-4 CPU, 1-16 GB"
          default: True
          description: "A shared machine, the recommended option until you experience a limitation."
          kubespawner_override:
            cpu_guarantee: 0.225
            mem_guarantee: 0.875G
            mem_limit: null
            node_selector:
              node.kubernetes.io/instance-type: m5.xlarge
        - display_name: "4th of Medium: 1-4 CPU, 4-16 GB"
          description: "A shared machine."
          kubespawner_override:
            cpu_guarantee: 0.875
            mem_guarantee: 3.5G
            mem_limit: null
            node_selector:
              node.kubernetes.io/instance-type: m5.xlarge
        - display_name: "Medium: 4 CPU, 16 GB"
          description: "A dedicated machine for you."
          kubespawner_override:
            cpu_guarantee: 3.5
            mem_guarantee: 14G
            mem_limit: null
            node_selector:
              node.kubernetes.io/instance-type: m5.xlarge
        - display_name: "Large: 16 CPU, 64 GB"
          description: "A dedicated machine for you."
          kubespawner_override:
            mem_guarantee: 56G
            mem_limit: null
            node_selector:
              node.kubernetes.io/instance-type: m5.4xlarge
        - display_name: "Massive: 64 CPU, 256 GB"
          description: "A dedicated machine for you."
          kubespawner_override:
            mem_guarantee: 224G
            mem_limit: null
            node_selector:
              node.kubernetes.io/instance-type: m5.16xlarge
        - display_name: "Massive high-memory: 64 CPU, 976 GB"
          description: "A dedicated machine for you."
          kubespawner_override:
            mem_guarantee: 900G
            mem_limit: null
            node_selector:
              node.kubernetes.io/instance-type: x1.16xlarge
        - display_name: "Medium GPU: 4 CPU, 16 GB, 1 T4 Tensor Core GPU"
          description: "A dedicated machine for you with one GPU attached."
          kubespawner_override:
            cpu_guarantee: 3.5
            mem_guarantee: 14G
            mem_limit: null
            node_selector:
              node.kubernetes.io/instance-type: g4dn.xlarge
            extra_resource_limits:
              nvidia.com/gpu: "1"
        - display_name: "Large GPU: 16 CPU, 64 GB, 1 T4 Tensor Core GPU"
          description: "A dedicated machine for you with one GPU attached."
          kubespawner_override:
            mem_guarantee: 56G
            mem_limit: null
            node_selector:
              node.kubernetes.io/instance-type: g4dn.4xlarge
            extra_resource_limits:
              nvidia.com/gpu: "1"
        - display_name: "Massive GPU: 64 CPU, 256 GB, 1 T4 Tensor Core GPU"
          description: "A dedicated machine for you with one GPU attached."
          kubespawner_override:
            mem_guarantee: 224G
            mem_limit: null
            node_selector:
              node.kubernetes.io/instance-type: g4dn.16xlarge
            extra_resource_limits:
              nvidia.com/gpu: "1"
        - display_name: "16th of Medium: 0.25-4 CPU, 1-16 GB - Test of latest image"
          description: "Helps us test an image before we make it the default"
          kubespawner_override:
            image: 286354552638.dkr.ecr.us-west-2.amazonaws.com/jmte/user-env:latest
            image_pull_policy: Always
            cpu_guarantee: 0.225
            mem_guarantee: 0.875G
            node_selector:
              node.kubernetes.io/instance-type: m5.xlarge
            mem_limit: null
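        # A rough reading of the guarantees above (an observation, not stated in
        # this file): they sit at roughly 85-90% of a profile's nominal share of
        # its node, leaving headroom for system pods. For example, "Medium" on an
        # m5.xlarge (4 CPU, 16 GB) guarantees 3.5 CPU and 14G, i.e. 0.875 * 4 and
        # 0.875 * 16G.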

    hub:
      config:
        JupyterHub:
          authenticator_class: cilogon
        CILogonOAuthenticator:
          scope:
            - "profile"
          username_claim: "preferred_username"
          # Only show the option to login with GitHub
          shown_idps:
            - http://github.com/login/oauth/authorize
        Authenticator:
          allowed_users: &users
            # This lists just a few of the users/admins; many more users
            # have been added manually, see:
            # https://github.com/pangeo-data/jupyter-earth/issues/53
            - abbyazari # Abby Azari
            - andersy005 # Anderson Banihirwe
            - consideratio # Erik Sundell
            - choldgraf # Chris Holdgraf
            - elliesch # Ellie Abrahams
            - EMscience # Edom Moges
            - espg # Shane Grigsby
            - facusapienza21 # Facundo Sapienza
            - fperez # Fernando Pérez
            - kmpaul # Kevin Paul
            - lrennels # Lisa Rennels
            - mrsiegfried # Matthew Siegfried
            - tsnow03 # Tasha Snow
            - whyjz # Whyjay Zheng
            - yuvipanda # Yuvi Panda
            - jonathan-taylor # Jonathan Taylor
          admin_users: *users
      allowNamedServers: true

dask-gateway:
  gateway:
    backend:
      scheduler:
        # IMPORTANT: We have experienced that the scheduler can fail with a
        #            1GB memory limit. This was observed as "stream closed"
        #            errors from the Python client working against the
        #            DaskCluster created by Dask-Gateway.
        #
        # CommClosedError: in <TLS (closed) ConnectionPool.gather local=tls://192.168.40.210:54296 remote=gateway://traefik-prod-dask-gateway.prod:80/prod.b9600f678bb747c1a5f038b5bef3eb90>: Stream is closed
        #
        cores:
          request: 1
          limit: 64
        memory:
          request: 2G
          limit: 500G

    # Note that we are overriding options provided in 2i2c's helm chart, which
    # has default values for these config entries.
    #
    extraConfig:
      # This configuration represents options that can be presented to users
      # who want to create a Dask cluster using dask-gateway. For more
      # details, see https://gateway.dask.org/cluster-options.html
      #
      # The goal is to provide a simple configuration that allows the user some
      # flexibility while also fitting well on AWS nodes, which all have a 1:4
      # ratio between CPU and GB of memory. By providing the username label,
      # we help administrators track user pods.
      option_handler: |
        from dask_gateway_server.options import Options, Select, String, Mapping
        def cluster_options(user):
            def option_handler(options):
                if ":" not in options.image:
                    raise ValueError("When specifying an image you must also provide a tag")
                extra_labels = {}
                extra_annotations = {
                    "prometheus.io/scrape": "true",
                    "prometheus.io/port": "8787",
                }
                chosen_worker_cpu = int(options.worker_specification.split("CPU")[0])
                chosen_worker_memory = 4 * chosen_worker_cpu
                # We multiply the requests by a fraction to ensure that the
                # workers fit well within a node that needs some resources
                # reserved for system pods.
                return {
                    # A default image is suggested via the DASK_GATEWAY__CLUSTER__OPTIONS__IMAGE env variable
                    "image": options.image,
                    "scheduler_extra_pod_labels": extra_labels,
                    "scheduler_extra_pod_annotations": extra_annotations,
                    "worker_extra_pod_labels": extra_labels,
                    "worker_extra_pod_annotations": extra_annotations,
                    "worker_cores": 0.85 * chosen_worker_cpu,
                    "worker_cores_limit": chosen_worker_cpu,
                    "worker_memory": "%fG" % (0.85 * chosen_worker_memory),
                    "worker_memory_limit": "%fG" % chosen_worker_memory,
                    "environment": options.environment,
                }
            return Options(
                Select(
                    "worker_specification",
                    [
                        "1CPU, 4GB",
                        "2CPU, 8GB",
                        "4CPU, 16GB",
                        "8CPU, 32GB",
                        "16CPU, 64GB",
                        "32CPU, 128GB",
                        "64CPU, 256GB",
                    ],
                    default="1CPU, 4GB",
                    label="Worker specification",
                ),
                # The default image is set via the DASK_GATEWAY__CLUSTER__OPTIONS__IMAGE env variable
                String("image", label="Image"),
                Mapping("environment", {}, label="Environment variables"),
                handler=option_handler,
            )
        c.Backend.cluster_options = cluster_options
      idle: |
        # timeout after 30 minutes of inactivity
        c.KubeClusterConfig.idle_timeout = 1800
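
The options defined by option_handler above are what users see from the dask-gateway client. A minimal usage sketch, assuming the user image has dask-gateway installed and preconfigured with this hub's gateway address (as is typical for daskhub deployments); the comments just re-derive what option_handler returns for this selection:

from dask_gateway import Gateway

gateway = Gateway()  # address/auth come from the hub-provided defaults
options = gateway.cluster_options()
options.worker_specification = "4CPU, 16GB"
# option_handler maps this to worker_cores=0.85*4=3.4 (request), worker_cores_limit=4,
# worker_memory=0.85*16=13.6G (request), worker_memory_limit=16G.
cluster = gateway.new_cluster(options)
cluster.scale(4)
client = cluster.get_client()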
@@ -0,0 +1,25 @@
{
"AccessKey": {
"AccessKeyId": "ENC[AES256_GCM,data:A3+Abzcvq+I2hZq2u4coAYzNjvk=,iv:B4kPrUIM8nx/VTrEQI+tUxEySkDDe6eZHJqAJ9B4YcU=,tag:PtO2TdNEJsaYY0nQyvTHSw==,type:str]",
"SecretAccessKey": "ENC[AES256_GCM,data:gfFXGESHTJn6tiQUpMkpbpqNJJ43KxkNvYaH8V7sC5lRKUPl85Dw7w==,iv:krcKBzv/Wzu+jjtd9MJiTQvj6ELo2JHXird+mn0Vt5c=,tag:jv4YANW0drzpjpVekpmzqg==,type:str]",
"UserName": "ENC[AES256_GCM,data:8fWApCCT7IL+9E6t0FkRS3XTaHDL+XA=,iv:/rsHbqCvzulMvT6Jzj20zqfOb39ojUWprFbn8359ozA=,tag:Nc1L5ufStyZMOUxI8xVrzA==,type:str]"
},
"sops": {
"kms": null,
"gcp_kms": [
{
"resource_id": "projects/two-eye-two-see/locations/global/keyRings/sops-keys/cryptoKeys/similar-hubs",
"created_at": "2023-04-07T13:38:22Z",
"enc": "CiUA4OM7eGDmmlUnGoSPNr9unRpxJ7GEcQ5/pXY2SrvhODPp9JWFEkkALQgViOWaFqYsRFv2FP6kqShPvabRqOC6KTPai4WGjiuK10rHIgiBbGNAfwQdenfi/vBU3h0rslaKojCN2qO4H+TAb4LG7eyO"
}
],
"azure_kv": null,
"hc_vault": null,
"age": null,
"lastmodified": "2023-04-07T13:38:23Z",
"mac": "ENC[AES256_GCM,data:HD/8swJpKnpElskOZXFjkJW6SjTIKChIZtHTqqlYexrj1x/HqrkLaGdHAuWIijZ91SOjxWlQxY67RzbpiJgdxG7XUcokrHqs+mEaWV65XVS087jucZo2tVC86wBFwNe4smlAEj6AF8n2gq/UAQbWoBE4fo3Vm/ojzhStqlLL0aQ=,iv:rrI6EO+c1LONQAHbsG7/TfEGlrrlKfzuriO+g29DFno=,tag:ZJqRJHVKlXOI+5S6cpsFtg==,type:str]",
"pgp": null,
"unencrypted_suffix": "_unencrypted",
"version": "3.7.3"
}
}
@@ -0,0 +1,21 @@
basehub:
jupyterhub:
hub:
config:
CILogonOAuthenticator:
client_id: ENC[AES256_GCM,data:pSlKv7EOrxXkt8Rhr2g1AmzK8r5chvTQNF9fcQclAiFJ4D4zrsgrKIAIMfBzK1qZaPqR,iv:bhyj+ytwpxHcpG782gwhxZ4T9qBYRuLzXF4kIczoM8w=,tag:QW7BUb87yxBLgQL2LaBelw==,type:str]
client_secret: ENC[AES256_GCM,data:DSoHfbfBHNroZ+c7+7BJIpUYfe5/RyuAj/uAgwCf51Q340WBow6X7nBSAKMTEgmzX2cOzYpjoWzQWXaM6IY21hmJJD9SjEJ1IY5kxmFqjahKGYvm9LM=,iv:Ze1SGQGcGw0mDQuoj7EVXnQTQAT/R/6T4/KH1n86orY=,tag:UfQQFNgUyyjcirl6ln52Sw==,type:str]
sops:
kms: []
gcp_kms:
- resource_id: projects/two-eye-two-see/locations/global/keyRings/sops-keys/cryptoKeys/similar-hubs
created_at: "2023-04-10T11:12:30Z"
enc: CiUA4OM7eITj8Go7yW8sabmF/ng6BAsGlrokT4eIYmff/PGx3HhhEkkALQgViBPoYsyjX82iMTE9MRaMY5Cp+4YwuGD9beVZCCRDZfyU9xkj+qHWH/6cr23FQ5iWmlmONLTeigXAWuV4nrGiQgqVAZuM
azure_kv: []
hc_vault: []
age: []
lastmodified: "2023-04-10T11:12:31Z"
mac: ENC[AES256_GCM,data:M/Q9+8zZjf4oX6WsAsXF/56WUrbce9QDEwYt6zWQ1yVZEujeIhmZ1Dw4OgEGeDGA8UdhGAlBH9hmpUtauQeMgXF3ajnO8S+5OscXsOueEAU8syNkGWpaI3r2U4ipm6ud76sf1juTPg7ia0wzLEb8kTx6qQvoNCyUYjB8Qoar42k=,iv:FiXQdIxf4sF06CWs5BctO47WvIe6bDy7rnmlbc1AlDc=,tag:NTAkSySc7Lsy4QhVVmPhMQ==,type:str]
pgp: []
unencrypted_suffix: _unencrypted
version: 3.7.3
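
Both secret files in this commit are encrypted with sops against the GCP KMS key listed in their metadata. A rough sketch of inspecting one locally, assuming the sops CLI is installed and you have access to that key; the file name is taken from cluster.yaml above:

import subprocess
import yaml

# sops decrypts using the GCP KMS key referenced in the file's own sops metadata.
decrypted = subprocess.run(
    ["sops", "--decrypt", "config/clusters/jupyter-meets-the-earth/enc-staging.secret.values.yaml"],
    capture_output=True, check=True, text=True,
).stdout
values = yaml.safe_load(decrypted)
print(list(values))  # top-level keys, e.g. ["basehub"]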