diff --git a/Makefile b/Makefile index 15c9a2c3..94c1eee1 100644 --- a/Makefile +++ b/Makefile @@ -256,6 +256,10 @@ open-project-docs : open project-docs/_build/html/index.html || \ xdg-open project-docs/_build/html/index.html +clean-project-docs: + rm -rf project-docs/venv + rm -rf project-docs/_build + deploy-workshop: kubectl apply -f https://github.com/vmware-tanzu-labs/lab-k8s-fundamentals/releases/download/5.0/workshop.yaml kubectl apply -f https://github.com/vmware-tanzu-labs/lab-k8s-fundamentals/releases/download/5.0/trainingportal.yaml @@ -283,6 +287,7 @@ prune-builds: rm -rf training-portal/venv rm -rf client-programs/bin rm -rf client-programs/pkg/renderer/files + rm -rf project-docs/venv rm -rf project-docs/_build prune-registry: diff --git a/NOTICE b/NOTICE index a31a77d9..63d5df39 100644 --- a/NOTICE +++ b/NOTICE @@ -1,4 +1,4 @@ -Copyright 2020-2023 The Educates Authors. +Copyright 2020-2024 The Educates Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/client-programs/go.mod b/client-programs/go.mod index 77f3cf14..d6c06981 100644 --- a/client-programs/go.mod +++ b/client-programs/go.mod @@ -147,7 +147,7 @@ require ( golang.org/x/time v0.5.0 // indirect golang.org/x/tools v0.16.1 // indirect google.golang.org/appengine v1.6.8 // indirect - google.golang.org/protobuf v1.31.0 // indirect + google.golang.org/protobuf v1.33.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect k8s.io/cli-runtime v0.29.0 // indirect diff --git a/client-programs/go.sum b/client-programs/go.sum index 798e1239..57bf8937 100644 --- a/client-programs/go.sum +++ b/client-programs/go.sum @@ -449,8 +449,8 @@ google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzi google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= -google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs8= -google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= +google.golang.org/protobuf v1.33.0 h1:uNO2rsAINq/JlFpSdYEKIZ0uKD/R9cpdv0T+yoGwGmI= +google.golang.org/protobuf v1.33.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20200902074654-038fdea0a05b/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= diff --git a/project-docs/conf.py b/project-docs/conf.py index 8d4c95bd..7896e42e 100644 --- a/project-docs/conf.py +++ b/project-docs/conf.py @@ -18,7 +18,7 @@ # -- Project information ----------------------------------------------------- project = 'Educates' -copyright = '2020-2023 The Educates Authors' +copyright = '2020-2024 The Educates Authors' author = 'Graham Dumpleton' diff --git a/project-docs/custom-resources/workshop-definition.md b/project-docs/custom-resources/workshop-definition.md index 968de588..863e39f3 100644 --- a/project-docs/custom-resources/workshop-definition.md +++ b/project-docs/custom-resources/workshop-definition.md @@ -566,6 +566,7 @@ spec: Note that the ability to override environment variables using this field should be limited to cases where they are required for the workshop. If you want to set or override an environment for a specific workshop environment, use the ability to set environment variables in the ``WorkshopEnvironment`` custom resource for the workshop environment instead. +(overriding-the-memory-available)= Overriding the memory available ------------------------------- @@ -629,6 +630,7 @@ In addition to secrets and configmaps these can be used to mount different types Note that ``volumeMounts`` are only added to the main workshop container. If mounting of a volume into a side car container was necessary for some purpose, then ``patches`` would need to be used to apply a patch against the complete workshop pod spec. +(resource-budget-for-namespaces)= Resource budget for namespaces ------------------------------ @@ -952,6 +954,7 @@ This can be added after an existing ``ytt`` or ``helmTemplate`` section under `` The purpose of the overlay is to set the owner of all resources generated to be the ``WorkshopSession`` resource for the workshop session. This will ensure that any resources will be automatically deleted when the workshop session is deleted, without relying on ``kapp-controller`` to clean them up. +(overriding-default-rbac-rules)= Overriding default RBAC rules ----------------------------- @@ -1047,6 +1050,7 @@ spec: Note that previously one would patch the workshop pod template and set ``automountServiceAccountToken`` to ``false``. That method no longer works as how the access token is mounted into the workshop container is now handled differently. +(running-user-containers-as-root)= Running user containers as root ------------------------------- @@ -1066,6 +1070,7 @@ spec: This setting applies to the primary session namespace and any secondary namespaces that may be created. +(creating-additional-namespaces)= Creating additional namespaces ------------------------------ @@ -1740,6 +1745,7 @@ spec: As you don't have access to the workshop files in the init container, you will need to either use a custom container image, or inject a script into the init container using a secret and a volume mount. +(patching-workshop-deployment)= Patching workshop deployment ---------------------------- diff --git a/project-docs/index.rst b/project-docs/index.rst index c91da26c..d4566ee1 100644 --- a/project-docs/index.rst +++ b/project-docs/index.rst @@ -45,6 +45,12 @@ Educates workshop-content/building-an-image workshop-content/working-on-content +.. toctree:: + :maxdepth: 2 + :caption: Production Guides: + + production-guides/review-guidelines + .. toctree:: :maxdepth: 2 :caption: Custom Resources: diff --git a/project-docs/installation-guides/configuration-settings.md b/project-docs/installation-guides/configuration-settings.md index a7e3bf73..3400521f 100644 --- a/project-docs/installation-guides/configuration-settings.md +++ b/project-docs/installation-guides/configuration-settings.md @@ -309,6 +309,7 @@ eth0 Link encap:Ethernet HWaddr 02:42:AC:11:00:07 If the ``MTU`` size is less than 1400, then use the value given, or a smaller value, for the ``dockerd.mtu`` setting. +(image-registry-pull-through-cache)= Image registry pull through cache --------------------------------- diff --git a/project-docs/production-guides/review-guidelines.rst b/project-docs/production-guides/review-guidelines.rst new file mode 100644 index 00000000..e4a43c9e --- /dev/null +++ b/project-docs/production-guides/review-guidelines.rst @@ -0,0 +1,21 @@ +Review Guidelines +================= + +Before a workshop is deployed in a production environment for others to use it should be reviewed to ensure it follows best practices, will not consume excessive amounts of resources, and does not create any undue security risks. + +The follow guidelines are intended to help reviewers ensure that the workshop is ready for deployment. + +.. toctree:: + :maxdepth: 2 + + review-guidelines/changes-to-the-security-policy + review-guidelines/disabling-kubernetes-access + review-guidelines/workshop-user-admin-access + review-guidelines/workshop-container-memory + review-guidelines/workshop-container-cpu + review-guidelines/workshop-container-storage + review-guidelines/namespace-resource-budget + review-guidelines/workshop-container-startup + review-guidelines/docker-resource-requirements + review-guidelines/docker-container-image-registry + review-guidelines/hosting-images-on-docker-hub diff --git a/project-docs/production-guides/review-guidelines/changes-to-the-security-policy.md b/project-docs/production-guides/review-guidelines/changes-to-the-security-policy.md new file mode 100644 index 00000000..5d2ad105 --- /dev/null +++ b/project-docs/production-guides/review-guidelines/changes-to-the-security-policy.md @@ -0,0 +1,32 @@ +Changes to the security policy +============================== + +By default RBAC is set up for workshop session namespaces such that containers can only run as non root users due to the increased security risk of allowing workshop users to run anything as root. + +In an ideal world this would be fine, but too many container images out there don't follow best practices of running as a non root user and instead will only work if run as root. This is especially the case for official images from Docker Hub. + +As an example, the `nginx` image on Docker Hub, which is often used in example deployments for Kubernetes, will only run correctly if run as root. + +Workshops can override the default `restricted` security policy applied through RBAC by setting the `session.namespaces.security.policy` property. Currently this can be [set](running-user-containers-as-root) to the alternative of `baseline` to allow containers to be run as root. + +```yaml +spec: + session: + namespaces: + security: + policy: baseline +``` + +Overall the recommended solution is not to use any container image that requires it be run as root. In the case of `nginx` one can use the `bitnami/nginx` image instead. The Bitnami catalog provides various other images as well which can run as a non root user where the official Docker Hub images will only run as root. + +If absolutely necessary a workshop definition can specify its own pod security policies and bind those to a distinct service account used by a deployment to give more elevated privileges, including running as a privileged container, but this should be avoided due to the huge security risks with workshop users being able to create their own deployments using the service account and a malicious container image which could compromise the cluster. + +**Recommendations** + +* If possible workshops should never use container images that require they be run as root. +* Ensure that the security policy is not overridden to allow containers images to run as root if there is no need for it. +* Ensure that workshops don't give themselves elevated privileges outside of the `restricted` and `baseline` defaults, especially the ability to run privileged containers, due to the extreme security risks. + +**Related Issues** + +Note that when a Kubernetes virtual cluster is enabled for use with a workshop session, the security policy for the session namespace is automatically changed from `restricted` to `baseline`. This is necessary for the virtual cluster to run properly, but also means that any deployments made to the virtual cluster by a workshop user can run as the root user. diff --git a/project-docs/production-guides/review-guidelines/disabling-kubernetes-access.md b/project-docs/production-guides/review-guidelines/disabling-kubernetes-access.md new file mode 100644 index 00000000..c7f1b7bc --- /dev/null +++ b/project-docs/production-guides/review-guidelines/disabling-kubernetes-access.md @@ -0,0 +1,21 @@ +Disabling Kubernetes access +=========================== + +By default, a workshop session always provides access to a namespace in the Kubernetes cluster for that session. This is so that as part of a workshop deployments can be made into the Kubernetes cluster. Usually a workshop user would only have access to the single namespace created for that session. + +If the topic of a workshop is such that there is never a need for a user to be able to deploy anything to the Kubernetes cluster themselves, access to the Kubernetes REST API can be disabled for the workshop. + +This is done by [setting](blocking-access-to-kubernetes) `session.namespaces.security.token.enabled` to `false` in the workshop definition, and results in the service account token not being mounted into the workshop container. + +```yaml +spec: + session: + namespaces: + security: + token: + enabled: false +``` + +**Recommendations** + +* Ensure that access to the Kubernetes REST API is disabled if access is not required. diff --git a/project-docs/production-guides/review-guidelines/docker-container-image-registry.md b/project-docs/production-guides/review-guidelines/docker-container-image-registry.md new file mode 100644 index 00000000..b7457ceb --- /dev/null +++ b/project-docs/production-guides/review-guidelines/docker-container-image-registry.md @@ -0,0 +1,24 @@ +Docker container image registry +=============================== + +A workshop can enable the deployment of a container image registry per workshop session. This can be used to store container images built as part of the workshop instructions using `docker build`, `pack`, `kpack`, `kaniko` etc. The container images in the per session container image registry can then be deployed into the Kubernetes cluster. + +The container image registry uses a distinct deployment separate to the deployment of the workshop instance for the session. To stores images the per session container image registry is by default given a 5Gi persistent volume. The amount of memory set aside for the container image registry defaults to 768Mi. The amount of storage and memory can +be [overridden](enabling-session-image-registry) by setting properties under `session.applications.registry` in the workshop definition. + +```yaml +spec: + session: + applications: + registry: + enabled: true + memory: 1Gi + storage: 20Gi +``` + +**Recommendations** + +* Ensure that adequate memory is allocated to the per session container image registry. +* Ensure that adequate storage space is allocated to the per session container image registry. +* Ensure that the workshop doesn't encourage an ongoing repetitive process of building container images and pushing them to the container image registry as this will incrementally keep using more storage as no pruning is done of old image layers. +* If possible, rather than have deployments in the cluster directly reference container images on Docker Hub, have a user pull images from Docker Hub and push them to the per workshop session image registry and deploy the container image from that registry. At the same time, ensure that use of a Docker Hub mirror for each workshop environment is configured for Educates to avoid issues with possible rate limiting by Docker Hub. Alternatively look at using a shared OCI image cache with specific workshop environments to mirror required images. diff --git a/project-docs/production-guides/review-guidelines/docker-resource-requirements.md b/project-docs/production-guides/review-guidelines/docker-resource-requirements.md new file mode 100644 index 00000000..72fa09b0 --- /dev/null +++ b/project-docs/production-guides/review-guidelines/docker-resource-requirements.md @@ -0,0 +1,46 @@ +Docker resource requirements +============================ + +When docker support is enabled a dockerd instance (`dind` - docker in docker) is run as a side car container to the workshop container. When building and/or running docker images within the workshop, this is done inside of the context of this side car container and not in the main workshop container. This is the same if using tools such as `pack` which although it doesn't use the docker command, does connect to the dockerd instance running in the side car container to do the actual work. + +To ensure that building and/or running container images doesn't impact on the memory availability for the workshop container, the dockerd side car is given its own separate memory allocation. By default the amount of memory is 768Mi and this memory amount is guaranteed. If running docker builds which require a lot of memory (Java and NodeJS builds), you may need to increase the memory given to the dockerd side car container. This can be [overridden](enabling-ability-to-use-docker) in the workshop definition using the `session.applications.docker.memory` setting. + +```yaml +spec: + session: + applications: + docker: + enabled: true + memory: 1Gi +``` + +Because of possible large transient filesystem space requirements when building container images, as well as the space required to store the built container image, or any container image pulled down into the local dockerd instance, a persistent volume is always allocated for the dockerd container. This avoids the problem that you might run out of space on the Kubernetes node were lots of workshop instances running at the same time, but depending on how large the container images are, or how much transient space is required to build a container image, you may need to increase the amount of storage. By default the amount of storage given to dockerd is 5Gi. This can be [overridden](enabling-ability-to-use-docker) in the workshop definition using the `session.applications.docker.storage` setting. + +``` +spec: + session: + applications: + docker: + enabled: true + storage: 20Gi +``` + +**Recommendations** + +* Ensure that docker support is not enabled if not being used because of additional resource requirements but also the security risks implicit in allowing dockerd to be run. +* Ensure that adequate memory is allocated to the dockerd side car container. +* Ensure that adequate storage space is allocated to the dockerd side car container. +* Ensure that the number of container images pulled down using docker is limited to what is required. +* Ensure that you don't encourage running successive builds as each change will result in more container layers being stored. +* Ensure that users are asked to delete container images to free up storage space if they can, before proceeding with subsequent workshop steps, rather than just allocating more storage space. +* Ensure that users are asked to delete stopped containers to free up storage space if they can, before proceeding with subsequent workshop steps, rather than just allocating more storage space. Alternatively, use the `-rm` option to `docker run` to ensure that stopped containers are automatically deleted when they exit. + +**Related Issues** + +Note that memory and storage requirements for the dockerd side car container should be added to the memory and storage requirements of the main workshop container when determining resource requirements for each workshop instance. In the case of memory, since both containers are in the same pod, the combined memory requirement has to be available on a node before a workshop instance can be scheduled to the node. For storage, a separate persistent volume claim is used for the dockerd side car container, but because it is part of the same pod as the main workshop container, a node has to have available with enough persistent volume mount points to support the combined total number of persistent volumes used. + +Note that you cannot use storage space allocations that may work on local Kubernetes clusters using Kind or Minikube as a guide for how much you should use as the amount of storage requested in the case of those Kubernetes clusters is ignored and instead you can always use up to as much space as the VM or host system in which the Kubernetes cluster is running has for file system space. + +Note that enabling docker support is always a risk and the Kubernetes cluster can be readily compromised by a knowledgeable person intent on mischief because dockerd has to be run using a privileged container with the user then also being able to run containers using docker as privileged. If possible avoid using docker and use methods for building container images that don't rely on docker, such as kaniko and Java native systems for building container images. + + If the only reason for using docker is to run some services for each workshop session, instead try and run them as Kubernetes services. Alternatively, use the mechanism of the docker support to run services using a docker-compose configuration snippet. Using this method will by default result in the docker socket not being exposed inside of the workshop container thus reducing the security risk. diff --git a/project-docs/production-guides/review-guidelines/hosting-images-on-docker-hub.md b/project-docs/production-guides/review-guidelines/hosting-images-on-docker-hub.md new file mode 100644 index 00000000..521145ba --- /dev/null +++ b/project-docs/production-guides/review-guidelines/hosting-images-on-docker-hub.md @@ -0,0 +1,39 @@ +Hosting images on Docker Hub +============================ + +If a workshop uses an OCI image containing workshop instructions hosted on Docker Hub, or enables use of docker support and the workshop instructions have a user pull an image from Docker Hub in order to run it, or an image on Docker Hub is used in a docker build, because each workshop session results in a distinct pull of the image from Docker Hub, when running many workshop instances at the same time it is inevitable that you will hit the rate limiting imposed by Docker Hub. + +In the case of running docker images or using them in an image build, in order to avoid being rate limited it is necessary to configure the Educates system to mirror images pulled by dockerd by enabling an [image registry pull through cache](image-registry-pull-through-cache) for Docker Hub. This though is not something that can be configured in the workshop definition itself and needs to be enabled for the whole Educates system on that cluster. + +It is important that when setting up the mirror registry that you configure it with credentials for a Docker Hub account to ensure a greater rate limit is applied. Better still, ensure that the account is a paid account to access a greater limit again or that there is no limit at all depending on type of paid account. Anonymous access or a free account is not going to guarantee that you will not still be rate limited when many different workshops are running on the same Kubernetes cluster and they are all using container images from Docker Hub. + +When enabled, an image registry mirror will be setup for each distinct workshop where docker support is enabled. It is not a single mirror registry for all workshops on the cluster as it is too hard to determine how much storage space should be allocated for the mirror registry if there is just the one. + +For storage, the mirror registry for a specific workshop will be configured to use the same amount of storage that the dockerd side car container is configured to use, defaulting to 5Gi. For memory, the mirror registry for a specific workshop will be configured to the same amount of memory that the dockerd side car container is configured to use, defaulting to 768Mi. This is on the presumption that since images are being pulled down and will end up being stored in the storage associated with the dockerd side car container, that it should be enough for the mirror registry also. + +The mirror registry only covers images pulled through the dockerd side car container. It does not come into play if deployments to the Kubernetes cluster directly reference images from Docker Hub. In this case container images pulled from Docker Hub would get cached on the nodes in the cluster, but if the cluster as a whole is seen as a single IP address you are still likely to get rate limited when there are many workshops deployed to the one cluster. To avoid this problem, don't use container images from Docker Hub, or, enable docker support plus a per workshop session image registry. If workshop instructions have users use docker pull to pull the container image to the local dockerd, then push the container image to the per workshop session image registry. The deployment in Kubernetes should then be setup to reference the container image from the per workshop session image registry. By pulling down the image in this way, the image will be pulled through the mirror registry only once for that workshop and not for every session. + +A further problematic case for container images stored on Docker Hub is that of custom workshop images. As these are deployments to the Kubernetes cluster, they are pulled direct from Docker Hub and will be subject to rate limiting as well. Custom workshop images which reside on Docker Hub would ideally be re-hosted on a different image registry and used from there when referenced from a workshop definition. + +An alternative to using a mirror registry linked to dockerd, with everything happening transparently, is to configure a [shared OCI image cache](shared-oci-image-cache) local to the workshop environment to cache the custom workshop image, with it only needing to be pulled from Docker Hub the one time and all other pulls being to the local cache. This does mean adjusting image references to explicitly use the shared OCI image cache. The shared OCI image cache can also be used with other upstream image registries besides Docker Hub, however, mirroring of images from private upstream image registries, or where credentials are required such as is the case with Docker Hub to enable a higher pull limits, is not supported. A benefit of using the shared OCI image cache is however that it can be used with OCI images holding workshop instructions and is not limited to traditional docker images. + +Whether for running, in builds with dockerd, or for deployment to the cluster, any container images should always use an unchanging fixed version. Workshop instructions or deployment resources should never use versions tags such as `main`, `master`, `develop` or `latest`, or even version tags representing a major or major/minor version (without patch level), for images pulled from Docker Hub. This is because if the container image on Docker Hub is updated but uses the same version tag, the next pull of the image can result in a new version of the image being pulled down which because it is distinct from a prior image pulled to some degree, could result in the storage for the mirror registry or local OCI image cache being filled up. + +Similarly to the general case, if you can't avoid using Docker Hub for custom workshop images, you should avoid using `main`, `master`, `develop` and `latest` version tags for a custom workshop image. This is because doing so enables a mechanism in Educates which causes the image pull policy to be set to `Always`, meaning that an update to the image on Docker Hub would result in it being pulled down again. It is not known whether the check itself to see if there is a new version might constitute an image pull and count against the Docker Hub download count. + +**Recommendations** + +* Ensure that OCI images for workshop instructions are not being hosted on Docker Hub. +* Ensure that custom workshop images are not being hosted on Docker Hub. +* Ensure that support for an image registry pull through mirror is configured for the Educates system if container images are being used from Docker Hub. +* Ensure that when images are used from Docker Hub that a version tag is used for container images which guarantees that the image will not be updated. +* If possible, rather than have deployments in the cluster directly reference container images on Docker Hub, have a user pull images from Docker Hub and push them to the per workshop session image registry and deploy the container image from that registry. +* If possible just don't use container images from Docker Hub, or if can't avoid it and don't want to make users pull them down and push them to the per workshop session image registry, copy images from Docker Hub in advance to an alternate common image registry hosted in the same cluster, or elsewhere and modify workshop instructions or deployment resources to use the container images from that image registry instead. Where practical use a shared OCI image cache for the workshop environment for this purpose. + +**Related Issues** + +Note that if users venture beyond the workshop instructions and start pulling arbitrary container images from Docker Hub, it is possible that it can act as a denial of service attack against that workshop as storage in the mirror registry could be filled up. It is not known if the docker image registry in mirror mode will self prune images when storage is low, or whether it goes into a mode whereby it just passes requests through, thus being subject to rate limiting again. Worst case it may be necessary to manually access the pod for the mirror registry to perform commands or access REST APIs to delete images which were pulled which shouldn't be. + +Note that only docker pull via the dockerd side car container is going to make use of the mirror registry. Other tools that pull image themselves, such as `dive`, `skopeo` or `imgpkg` are not going to make use of the mirror registry and would still be subject to rate limiting if using container images from Docker Hub. This is why you should not store workshop instructions/files on Docker Hub as an OCI image artefact as `imgpkg` is used in that case to pull down the OCI image artefact and unpack it at the start of every workshop session. + +Note that you SHOULD NOT embed your own credentials for Docker Hub in a workshop definition and have users use that to login to Docker Hub to pull images as your credentials are obviously exposed and depending on access type they could login to the account, change the password, or otherwise cause problems. diff --git a/project-docs/production-guides/review-guidelines/namespace-resource-budget.md b/project-docs/production-guides/review-guidelines/namespace-resource-budget.md new file mode 100644 index 00000000..ebbfc561 --- /dev/null +++ b/project-docs/production-guides/review-guidelines/namespace-resource-budget.md @@ -0,0 +1,28 @@ +Namespace resource budget +========================= + +For each workshop session a namespace is created in the Kubernetes cluster for use by just that session. This is to allow a workshop to create deployments in the cluster using Kubernetes resources, tools etc. If more than one namespace is required, this can be configured through the workshop definition. + +By default Educates does not impose any restrictions on what resources can be consumed by applications deployed to the session namespace. Any limit will be whatever may be dictated by the Kubernetes cluster itself, which for standard Kubernetes cluster is no limits at all. This means a resource budget must be defined in the workshop definition for a workshop in order to limit how much resources can be used. This is done by [overriding](resource-budget-for-namespaces) the `session.namespaces.budget` property in the workshop definition. + +```yaml +spec: + session: + namespaces: + budget: medium +``` + +The value for the property is a t-shirt size which maps to a resource quota for the amount of memory and CPU which can be used by deployments in the session namespace, as well as dictating limit ranges and defaults for memory and CPU if deployments do not specify themselves what should be used. + +Individual defaults and limit ranges specified for containers by the t-shirt size can be overridden, but these cannot be changed so they fall outside of the bounds specified for the pod by the t-shirt size. It is best practice that any deployment to the session namespace set their own resource requirements in the deployment resource rather than falling back on the defaults imposed by the limit ranges. + +If you need more control over the resource quotas and limit ranges the budget would be set to be `custom` with you needing to provide `LimitRange` and `ResourceQuota` resources as part of `session.objects`. + +If the workshop uses secondary namespaces, the budget for these can similarly be [overridden](creating-additional-namespaces). + +**Recommendations** + +* Ensure that workshops specify a budget for the primary session namespace and any secondary namespaces. If they don't then a workshop user could consume all resources of the cluster. +* Ensure that if possible deployments made to the cluster from a workshop specify their own container resource requirements rather than falling back on using the default limit ranges. +* Ensure that if a workshop specifies a `custom` budget that it is providing appropriate `LimitRange` and `ResourceQuota` definitions. +* Ensure that if the session namespace is not required for a workshop, that access to the Kubernetes REST API is disabled. diff --git a/project-docs/production-guides/review-guidelines/workshop-container-cpu.md b/project-docs/production-guides/review-guidelines/workshop-container-cpu.md new file mode 100644 index 00000000..7e021bf7 --- /dev/null +++ b/project-docs/production-guides/review-guidelines/workshop-container-cpu.md @@ -0,0 +1,28 @@ +Workshop container CPU +====================== + +By default the workshop session container does not specify any resource requirement for CPU. This in general works fine as work done from the terminal in the workshop container is not usually CPU intensive or is short lived. In the event that many workshop sessions do run on the same node and they are all running CPU intensive tasks at the same time, what will happen is that Kubernetes will limit all CPU usage by the workshop container in proportion to others. As a result, it shouldn't be possible for any one container to completely monopolise the available CPU requirements as all will be limited as they don't specify a CPU requirement and thus CPU is provided as best effort. + +It is possible that this default strategy may not always work and the user experience could be less than optimal when the number of current workshop sessions is scaled up. + +A problematic case may be one where a workshop session runs commands from the terminal which are CPU intensive and long running (eg., GraalVM compilation). In this case were lots of workshop sessions scheduled to the same node and the CPU intensive periods overlap, they could starve each other for CPU and slow things down notably. + +In this case it would be recommended to run such a workshop on its own cluster with high CPU instance types for the nodes, and not have different workshops run on the same cluster. Further, a pod template [patch](patching-workshop-deployment) should be applied to the workshop container in order to specify a CPU request and limit so they get a guaranteed amount of CPU, but at the same time are also limited at the top end of how much they can use. + +```yaml +spec: + session: + patches: + containers: + - name: workshop + resources: + requests: + cpu: "500m" + limits: + cpu: "1000m" +``` + +**Recommendations** + +* Ensure that workshops with high CPU requirements for extended periods of time run on their own clusters with high CPU instance types for nodes. +* Ensure that workshops with high CPU requirements for extended periods of time specify request/limit values for CPU on the workshop container. diff --git a/project-docs/production-guides/review-guidelines/workshop-container-memory.md b/project-docs/production-guides/review-guidelines/workshop-container-memory.md new file mode 100644 index 00000000..cf0283ec --- /dev/null +++ b/project-docs/production-guides/review-guidelines/workshop-container-memory.md @@ -0,0 +1,31 @@ +Workshop container memory +========================= + +By default the container the workshop environment is running in is allocated 512Mi. If the editor is enabled this is increased to 1Gi. This memory is guaranteed, and as such Kubernetes will reserve that amount of memory on the node for the instance when scheduling pods. + +This amount of memory is inclusive of what you may need for any commands run from the command line. If doing code compilation (especially Java applications), you may need to increase the amount of memory dedicated to the workshop container. If the workshop uses the editor a lot, or the workshop encourages a workshop user to explore source code or other files supplied with the workshop, the memory required by the editor can keep growing, so keep that in consideration when deciding if the amount of memory needs to be increased. If you don't provide enough memory, then the editor or applications run from the command line could fail when they run out of memory. + +Memory available to the workshop container is [overridden](overriding-the-memory-available) using the `session.resources.memory` setting in the workshop definition. + +```yaml +spec: + session: + resources: + memory: 2Gi +``` + +**Recommendations** + +* Ensure that you do not enable the editor if it is not required. +* Ensure that you do not enable a Kubernetes web console if not required. +* Ensure you do analysis to determine the maximum amount of memory required by the workshop container and set the memory size appropriately. + +**Related issues** + +Note that if docker support is enabled and you are building/running containers within the workshop environment (not as deployments in the cluster), the memory allocation for dockerd is distinct from that for the workshop container and should not be counted in the above allocation. + +Note that increasing the size of nodes in the Kubernetes cluster such that more memory is available per node is not necessarily recommended as the amount of persistent volumes that can be mounted on a node is generally limited and does not increase with the amount of memory the node has. As such, a general guideline is to use nodes with 32Gi of memory (rather than 64Gi), and add as many nodes as required to the Kubernetes cluster on that basis. + +Note that the memory is guaranteed by setting the `requests` resource value the same as the `limits` value in the workshop container of the Kubernetes deployment. Although scheduling will take this into consideration with Kubernetes aiming to place the deployment on a node in the cluster which is known to have enough memory, you can still have problems when a node autoscalar is enabled. + +The issue with autoscalars is that if the resources of the node are not being sufficiently used, it may decide to evict anything deployed to the node forcing them to be redeployed on another node. An example of a Kubernetes cluster from an IaaS provider which uses an autoscalar is GKE autopilot. The consequence of using a cluster with autoscaling enabled is that workshop sessions can be interrupted and a users work is lost, with them being returned to the beginning when the new deployment of the workshop container starts up again. For this reason you should not deploy Educates on clusters where an autoscalar is configured which can see the number of nodes being scaled down automatically. diff --git a/project-docs/production-guides/review-guidelines/workshop-container-startup.md b/project-docs/production-guides/review-guidelines/workshop-container-startup.md new file mode 100644 index 00000000..e9ad283b --- /dev/null +++ b/project-docs/production-guides/review-guidelines/workshop-container-startup.md @@ -0,0 +1,25 @@ +Workshop container startup +========================== + +If a workshop needs to run special steps when the workshop container starts, it can provide executable shell scripts in the `workshop/setup.d` directory. These scripts will be run after the basic configuration is done for the workshop container, but before any application services or the dashboard are started. The scripts are self contained and setting environment variables in the scripts doesn't affect anything run after the scripts. + +The typical use for these setup scripts is to generate custom resource files to be used in the workshop which have been pre-filled with details specific to the session, such as the name of the session names, ingress hostname to use etc. + +If necessary, the setup scripts could be used to download separate application source code examples or binaries for tools required by a workshop, however, because these scripts are run during the process of starting up the workshop container and is done for every distinct workshop instance, a long running script will delay startup of the workshop container. If the scripts take too long to run, a user may think the workshop session is broken and abandon it. A long running script could also interfere with the timeout mechanisms in place to determine whether the workshop user has accessed the workshop session. + +It is strongly recommended that these setup scripts not perform any action which takes more than 10 seconds. This means they are not really viable for downloading very large binary packages for tools. For example, it wouldn't be practical to download a JDK environment from a setup script. You also couldn't use them to pre-run a Java compilation. + +When it is mentioned that the shell script needs to be executable, this means it must have the file execute bit set (`chmod +x`). If it is not marked as executable, it will not be run. + +Any output from the setup script will be automatically appended to the file `~/.local/share/workshop/setup-scripts.log` in the workshop container, and to the workshop pod logs. It is not necessary for the setup scripts to try and capture output and log it to its own log file. + +**Recommendations** + +* Ensure that setup scripts do not try to log their own output to a file so that the output is capture in the default log file locations. +* Ensure that setup scripts only perform actions which take a short amount of time. Recommended less than 10 seconds. +* Ensure that setup scripts will not fail if run more than once as might occur if the workshop container was stopped and restarted. +* Ensure that setup scripts don't run application services in background, this is what the supervisord instance is for. + +**Related issues** + +Due to limitations with file permissions not being correctly preserved by `vendir` when unpacking archives, any scripts in the `workshop/setup.d` are currently being marked as executable automatically. For future compatibility such scripts should still be marked as executable in case the workaround is rolled back. diff --git a/project-docs/production-guides/review-guidelines/workshop-container-storage.md b/project-docs/production-guides/review-guidelines/workshop-container-storage.md new file mode 100644 index 00000000..354622f3 --- /dev/null +++ b/project-docs/production-guides/review-guidelines/workshop-container-storage.md @@ -0,0 +1,29 @@ +Workshop container storage +========================== + +Workshops should write any files created as part of the workshop under the home directory of the workshop user. By default this directory is normal transient filesystem space within the container and as such all workshop instances running on the same Kubernetes node will be competing for whatever free disk space is made available on the node for container filesystems. + +If a workshop downloads a large amount of binaries/source code into the workshop container, runs compilation steps, or downloads a large number of packages required when compiling applications (especially when using Java or NodeJS), if too many instances of a workshop are running at the same time on the Kubernetes node then you can feasibly run out of filesystem space on the node. The result of running low on filesystem space on the node can be that pods will be evicted from the node, which in the case of workshop instances would result in loss of any work and a user would need to start over. + +To avoid the possibility of running out of filesystem space when a workshop requires a large amount of transient filesystem space, storage space should be allocated to the workshop container for the home directory of the workshop user. Doing this will result in a persistent volume claim being made for the amount of storage space requested and that persistent volume will be mounted on the home directory of the workshop user. Memory available to the workshop container is [overridden](mounting-a-persistent-volume) using the `session.resources.storage setting` in the workshop definition. + +```yaml +spec: + session: + resources: + storage: 5Gi +``` + +**Recommendations** + +* Ensure that storage is allocated for the workshop container if the workshop produces a large amount of file data during the workshop. + +**Related Issues** + +Note that if using a custom workshop image which bundles required files as part of the workshop and they are in the home directory, the persistent volume will be mounted on top. To deal with this the workshop environment when storage space is requested, will run an init container which will first copy the contents of the home directory from the custom workshop image into the persistent volume. The persistent volume is then mounted on the home directory when the main container is run. This means any files under the home directory in the custom workshop image will be transparently transferred to the persistent volume for you, without you needing to do anything. Do be aware though that if there is a large amount of files to be copied, this can delay startup of the workshop session. + +Note that you cannot use storage space allocations that may work on local Kubernetes clusters using Kind or Minikube as a guide for how much you should use. This is because the amount of storage requested is ignored in the case of those Kubernetes clusters and instead you can always use up to as much space as the VM or host system in which the Kubernetes cluster is running has for file system space. + +Note that for many infrastructure providers usually there is a limited number of persistent volumes that can be mounted on each node. The number of persistent volumes does not increase if the amount of memory the node has is increased, but stays the same. As a result, where persistent volumes are required for workshop instances, increasing the amount of memory per node in the cluster to pack more workshop instances into a node will not help as there may not be enough persistent volumes for the number of workshop instances that could fit in the node. As such, a general guideline is to use nodes with 32Gi of memory (rather than 64Gi), and add as many nodes as required to the Kubernetes cluster on that basis. + +Note that there is always a risk that a workshop user could try and run a form of denial of service attack against a Kubernetes cluster, even when storage space is allocated, by writing large amounts of data into another writable part of the filesystem such as `/tmp`. How big of a problem this could be would depend on whether the Kubernetes cluster implements some sort of limits on how much transient container filesystem space any one container can use. If there is no limit there is the risk that the complete file system for the node allocated for container use could be used up, affecting other applications running on the same node. diff --git a/project-docs/production-guides/review-guidelines/workshop-user-admin-access.md b/project-docs/production-guides/review-guidelines/workshop-user-admin-access.md new file mode 100644 index 00000000..0d12d70c --- /dev/null +++ b/project-docs/production-guides/review-guidelines/workshop-user-admin-access.md @@ -0,0 +1,18 @@ +Workshop user admin access +========================== + +By default workshop users will have admin role within their session namespaces if granted access to the cluster. They do not have the ability to perform cluster admin actions on the Kubernetes cluster itself. + +A workshop user could be granted cluster admin access by [overriding the default RBAC rules](overriding-default-rbac-rules) but this should never be allowed for workshops running in shared clusters where public or untrusted users can access the workshop. + +Cluster admin access should only ever be used where a workshop is being run by a user themselves on their own cluster running on their local machine, with their own workshops. + +The obvious risk of a workshop granting cluster admin access is that the user has access they shouldn't otherwise and if it is a shared cluster they could interfere with other users, access secret information, or break the cluster itself. + +If a workshop does require a level of cluster admin access, then virtual clusters could instead be used. A virtual cluster will provide each workshop user their own separate Kubernetes cluster with cluster admin access. This will allow them to perform operations within the cluster as an admin, but they will still not be able to access the underlying UNIX system nodes, or reconfigure the Kubernetes control plane. + +**Recommendations** + +* Ensure that workshops do not give the workshop user cluster admin access. +* Ensure that cluster admin access or other elevated privileges is never given to service accounts in session namespaces the workshop user can access. +* Use a virtual cluster if a workshop needs to demonstrate functions requiring cluster admin access. diff --git a/project-docs/project-details/project-overview.md b/project-docs/project-details/project-overview.md index 56c60cea..6441416b 100644 --- a/project-docs/project-details/project-overview.md +++ b/project-docs/project-details/project-overview.md @@ -8,7 +8,7 @@ Although the principal deployment platform for Educates is Kubernetes, and is be Latest project version ---------------------- -The latest release of Educates is version 2.6.16. +The latest release of Educates is version 2.7.0. Source code repositories ------------------------ diff --git a/project-docs/requirements.txt b/project-docs/requirements.txt index c7a08664..e365180b 100644 --- a/project-docs/requirements.txt +++ b/project-docs/requirements.txt @@ -1,12 +1,5 @@ -sphinx==4.3.0 -sphinx-rtd-theme==1.0.0 -recommonmark==0.6.0 -myst-parser==0.17.0 -attrs==21.4.0 - -# Pin due to CVEs. -certifi==2023.07.22 -markdown-it-py==2.2.0 -requests==2.31.0 -pygments==2.15.0 -urllib3==1.26.18 +sphinx<7.2.0 +sphinx-rtd-theme==2.0.0 +recommonmark==0.7.1 +myst-parser==2.0.0 +attrs==23.2.0 diff --git a/secrets-manager/main.py b/secrets-manager/main.py index c3122693..f9a9524e 100644 --- a/secrets-manager/main.py +++ b/secrets-manager/main.py @@ -27,7 +27,8 @@ def configure(settings: kopf.OperatorSettings, **_): settings.posting.level = logging.DEBUG settings.watching.connect_timeout = 1 * 60 - settings.watching.server_timeout = 10 * 60 + settings.watching.server_timeout = 5 * 60 + settings.watching.client_timeout = settings.watching.server_timeout + 10 @kopf.on.login() diff --git a/session-manager/main.py b/session-manager/main.py index ba3febfc..b972945e 100644 --- a/session-manager/main.py +++ b/session-manager/main.py @@ -32,7 +32,8 @@ def configure(settings: kopf.OperatorSettings, **_): settings.posting.level = logging.DEBUG settings.watching.connect_timeout = 1 * 60 - settings.watching.server_timeout = 10 * 60 + settings.watching.server_timeout = 5 * 60 + settings.watching.client_timeout = settings.watching.server_timeout + 10 @kopf.on.login() diff --git a/training-portal/requirements.txt b/training-portal/requirements.txt index 0466ce2e..422e61d5 100644 --- a/training-portal/requirements.txt +++ b/training-portal/requirements.txt @@ -1,5 +1,5 @@ mod_wsgi==5.0.0 -Django==4.2.10 +Django==4.2.11 django-registration==3.4 django-crispy-forms==2.1 crispy-bootstrap5==2023.10 diff --git a/training-portal/src/project/apps/workshops/manager/operator.py b/training-portal/src/project/apps/workshops/manager/operator.py index 2aa37df7..02e78552 100644 --- a/training-portal/src/project/apps/workshops/manager/operator.py +++ b/training-portal/src/project/apps/workshops/manager/operator.py @@ -138,7 +138,8 @@ def wrapper(*args, **kwargs): def configure(settings: kopf.OperatorSettings, **_): settings.posting.level = logging.DEBUG settings.watching.connect_timeout = 1 * 60 - settings.watching.server_timeout = 10 * 60 + settings.watching.server_timeout = 5 * 60 + settings.watching.client_timeout = settings.watching.server_timeout + 10 @kopf.on.login() diff --git a/tunnel-manager/main.py b/tunnel-manager/main.py index eb636664..e847e65a 100644 --- a/tunnel-manager/main.py +++ b/tunnel-manager/main.py @@ -42,7 +42,8 @@ def xgetattr(obj, key, default=None): def configure(settings: kopf.OperatorSettings, **_): settings.posting.level = logging.DEBUG settings.watching.connect_timeout = 1 * 60 - settings.watching.server_timeout = 10 * 60 + settings.watching.server_timeout = 5 * 60 + settings.watching.client_timeout = settings.watching.server_timeout + 10 @kopf.on.login() diff --git a/workshop-images/base-environment/opt/gateway/package-lock.json b/workshop-images/base-environment/opt/gateway/package-lock.json index c7341854..9e67f5ee 100644 --- a/workshop-images/base-environment/opt/gateway/package-lock.json +++ b/workshop-images/base-environment/opt/gateway/package-lock.json @@ -1843,9 +1843,9 @@ "integrity": "sha512-GRnmB5gPyJpAhTQdSZTSp9uaPSvl09KoYcMQtsB9rQoOmzs9dH6ffeccH+Z+cv6P68Hu5bC6JjRh4Ah/mHSNRw==" }, "node_modules/follow-redirects": { - "version": "1.15.5", - "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.5.tgz", - "integrity": "sha512-vSFWUON1B+yAw1VN4xMfxgn5fTUiaOzAJCKBwIIgT/+7CuGy9+r+5gITvP62j3RmaD5Ph65UaERdOSRGUzZtgw==", + "version": "1.15.6", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz", + "integrity": "sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==", "funding": [ { "type": "individual", @@ -5965,9 +5965,9 @@ "integrity": "sha512-GRnmB5gPyJpAhTQdSZTSp9uaPSvl09KoYcMQtsB9rQoOmzs9dH6ffeccH+Z+cv6P68Hu5bC6JjRh4Ah/mHSNRw==" }, "follow-redirects": { - "version": "1.15.5", - "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.5.tgz", - "integrity": "sha512-vSFWUON1B+yAw1VN4xMfxgn5fTUiaOzAJCKBwIIgT/+7CuGy9+r+5gITvP62j3RmaD5Ph65UaERdOSRGUzZtgw==" + "version": "1.15.6", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz", + "integrity": "sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==" }, "fontfaceobserver": { "version": "2.3.0",