podspec: add additional features for podspec #231

Merged (4 commits) on Jul 23, 2024
2 changes: 1 addition & 1 deletion Dockerfile
@@ -1,5 +1,5 @@
 # Build the manager binary
-FROM golang:1.20 as builder
+FROM golang:1.20 AS builder
 
 WORKDIR /workspace
 
4 changes: 4 additions & 0 deletions api/v1alpha2/minicluster_types.go
@@ -188,6 +188,10 @@ type PodSpec struct {
 	// +optional
 	ServiceAccountName string `json:"serviceAccountName,omitempty"`
 
+	// RuntimeClassName for the pod
+	// +optional
+	RuntimeClassName string `json:"runtimeClassName,omitempty"`
+
 	// Automatically mount the service account name
 	// +optional
 	AutomountServiceAccountToken bool `json:"automountServiceAccountToken,omitempty"`
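The new field above surfaces on the MiniCluster custom resource under `pod`. A minimal sketch of a resource using it (apiVersion per this repo's v1alpha2 API; the image and runtime class name are illustrative):

```yaml
apiVersion: flux-framework.org/v1alpha2
kind: MiniCluster
metadata:
  name: flux-sample
spec:
  size: 2
  containers:
    - image: fluxrm/flux-sched:jammy
  pod:
    # Passed through to each pod's runtimeClassName (added by this PR)
    runtimeClassName: nvidia
```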
4 changes: 4 additions & 0 deletions api/v1alpha2/swagger.json
@@ -775,6 +775,10 @@
"description": "Restart Policy",
"type": "string"
},
"runtimeClassName": {
"description": "RuntimeClassName for the pod",
"type": "string"
},
"schedulerName": {
"description": "Scheduler name for the pod",
"type": "string"
7 changes: 7 additions & 0 deletions api/v1alpha2/zz_generated.openapi.go

Some generated files are not rendered by default.

3 changes: 3 additions & 0 deletions chart/templates/minicluster-crd.yaml
@@ -555,6 +555,9 @@ spec:
 restartPolicy:
   description: Restart Policy
   type: string
+runtimeClassName:
+  description: RuntimeClassName for the pod
+  type: string
 schedulerName:
   description: Scheduler name for the pod
   type: string
3 changes: 3 additions & 0 deletions config/crd/bases/flux-framework.org_miniclusters.yaml
@@ -558,6 +558,9 @@ spec:
 restartPolicy:
   description: Restart Policy
   type: string
+runtimeClassName:
+  description: RuntimeClassName for the pod
+  type: string
 schedulerName:
   description: Scheduler name for the pod
   type: string
13 changes: 11 additions & 2 deletions controllers/flux/job.go
@@ -45,8 +45,8 @@ func NewMiniClusterJob(cluster *api.MiniCluster) (*batchv1.Job, error) {
 			Labels: cluster.Spec.JobLabels,
 		},
 
-		// Completions must be == to Parallelism to allow for scaling
 		Spec: batchv1.JobSpec{
+
 			BackoffLimit: &backoffLimit,
 			Completions:  &cluster.Spec.Size,
 			Parallelism:  &cluster.Spec.Size,
@@ -70,13 +70,22 @@ func NewMiniClusterJob(cluster *api.MiniCluster) (*batchv1.Job, error) {
 				ImagePullSecrets:             getImagePullSecrets(cluster),
 				ServiceAccountName:           cluster.Spec.Pod.ServiceAccountName,
 				AutomountServiceAccountToken: &cluster.Spec.Pod.AutomountServiceAccountToken,
-				RestartPolicy:                corev1.RestartPolicy(cluster.Spec.Pod.RestartPolicy),
+				RestartPolicy:                corev1.RestartPolicyOnFailure,
 				NodeSelector:                 cluster.Spec.Pod.NodeSelector,
 				SchedulerName:                cluster.Spec.Pod.SchedulerName,
 			},
 		},
 	},
 }
+	// Custom restart policy
+	if cluster.Spec.Pod.RestartPolicy != "" {
+		job.Spec.Template.Spec.RestartPolicy = corev1.RestartPolicy(cluster.Spec.Pod.RestartPolicy)
+	}
+
+	// Only add runtimeClassName if defined
+	if cluster.Spec.Pod.RuntimeClassName != "" {
+		job.Spec.Template.Spec.RuntimeClassName = &cluster.Spec.Pod.RuntimeClassName
+	}
+
 	// Add Affinity to map one pod / node only if the user hasn't disabled it
 	if !cluster.Spec.Network.DisableAffinity {
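The net behavior of the restart policy handling above: pods default to `OnFailure`, an explicit `pod.restartPolicy` overrides it, and `runtimeClassName` is only set on the template when non-empty. A sketch of the relevant MiniCluster fields (values illustrative):

```yaml
pod:
  # Omit restartPolicy to keep the OnFailure default applied in job.go;
  # an explicit value like Never overrides it
  restartPolicy: Never
  # Left unset, no RuntimeClassName is added to the pod template
  runtimeClassName: nvidia
```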
12 changes: 11 additions & 1 deletion controllers/flux/pods.go
@@ -135,13 +135,23 @@ func (r *MiniClusterReconciler) newServicePod(
 		SetHostnameAsFQDN:             &setAsFQDN,
 		Volumes:                       existingVolumes,
 		ImagePullSecrets:              getImagePullSecrets(cluster),
-		RestartPolicy:                 corev1.RestartPolicy(cluster.Spec.Pod.RestartPolicy),
+		RestartPolicy:                 corev1.RestartPolicyOnFailure,
 		ServiceAccountName:            cluster.Spec.Pod.ServiceAccountName,
 		AutomountServiceAccountToken:  &cluster.Spec.Pod.AutomountServiceAccountToken,
 		NodeSelector:                  cluster.Spec.Pod.NodeSelector,
 	},
 }
 
+	// Custom restart policy
+	if cluster.Spec.Pod.RestartPolicy != "" {
+		pod.Spec.RestartPolicy = corev1.RestartPolicy(cluster.Spec.Pod.RestartPolicy)
+	}
+
+	// Only add runtimeClassName if defined
+	if cluster.Spec.Pod.RuntimeClassName != "" {
+		pod.Spec.RuntimeClassName = &cluster.Spec.Pod.RuntimeClassName
+	}
+
 	// Assemble existing volume mounts - they are added with getContainers
 	mounts := []corev1.VolumeMount{}
 
37 changes: 33 additions & 4 deletions docs/getting_started/custom-resource-definition.md
@@ -188,8 +188,11 @@ When enabled, meaning that we use flux from a view within the container, these c
 
 - [ghcr.io/converged-computing/flux-view-rocky:tag-9](https://github.com/converged-computing/flux-views/pkgs/container/flux-view-rocky)
 - [ghcr.io/converged-computing/flux-view-rocky:tag-8](https://github.com/converged-computing/flux-views/pkgs/container/flux-view-rocky)
+- [ghcr.io/converged-computing/flux-view-ubuntu:tag-noble](https://github.com/converged-computing/flux-views/pkgs/container/flux-view-ubuntu)
 - [ghcr.io/converged-computing/flux-view-ubuntu:tag-jammy](https://github.com/converged-computing/flux-views/pkgs/container/flux-view-ubuntu)
 - [ghcr.io/converged-computing/flux-view-ubuntu:tag-focal](https://github.com/converged-computing/flux-views/pkgs/container/flux-view-ubuntu)
+
 Note that we have [arm builds](https://github.com/converged-computing/flux-views/tree/main/arm) available for each of rocky and ubuntu as well.
 If you don't want to use Flux from a view (and want to use the v1alpha1 design of the Flux Operator that had the application alongside Flux) you can do that by way of disabling the flux view:
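The example that the colon above introduces is collapsed out of this diff; a minimal sketch of the setting, assuming a `flux.container.disable` field corresponding to the `.Spec.Flux.Container.Disable` checks used in the templates below:

```yaml
flux:
  container:
    disable: true
```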
@@ -682,6 +685,34 @@ pod:
   serviceAccountName: my-service-account
 ```
 
+#### restartPolicy
+
+To customize the restartPolicy for the pod:
+
+```yaml
+pod:
+  restartPolicy: Never
+```
+
+#### runtimeClassName
+
+To add a runtime class name:
+
+```yaml
+pod:
+  runtimeClassName: nvidia
+```
+
+#### automountServiceAccountToken
+
+If you want to automatically mount a service account token:
+
+```yaml
+pod:
+  automountServiceAccountToken: true
+```
+
+
 #### nodeSelector
 
 A node selector is a set of key value pairs that helps to schedule pods to the right nodes! You can
@@ -720,10 +751,8 @@ name: rabbit

 #### image
 
-This is the only required attribute! You *must* provide a container base that has Flux.
-The requirements of your container are defined in the README of the [flux-hpc](https://github.com/rse-ops/flux-hpc/)
-repository. Generally speaking, you need to have Flux executables, Flux Python bindings,
-and your own executables on the path, and should be started with root with a flux user.
+You do not need to provide a container base that has Flux, but you must make sure the view (built for a particular operating system) that will add Flux matches your container. We don't require you to start as root, but if you
+have a container with a non-root user, the user needs to have sudo available (to act as root).
 If you use the [fluxrm/flux-sched](https://hub.docker.com/r/fluxrm/flux-sched)
 base containers this is usually a good start.

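One caveat for the new `runtimeClassName` documentation above: the named RuntimeClass must already exist on the cluster. A minimal sketch of one, using the standard Kubernetes `node.k8s.io/v1` API (the `nvidia` name and handler are illustrative):

```yaml
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
  name: nvidia
handler: nvidia
```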
3 changes: 3 additions & 0 deletions examples/dist/flux-operator-arm.yaml
@@ -564,6 +564,9 @@ spec:
 restartPolicy:
   description: Restart Policy
   type: string
+runtimeClassName:
+  description: RuntimeClassName for the pod
+  type: string
 schedulerName:
   description: Scheduler name for the pod
   type: string
3 changes: 3 additions & 0 deletions examples/dist/flux-operator.yaml
@@ -564,6 +564,9 @@ spec:
 restartPolicy:
   description: Restart Policy
   type: string
+runtimeClassName:
+  description: RuntimeClassName for the pod
+  type: string
 schedulerName:
   description: Scheduler name for the pod
   type: string
23 changes: 12 additions & 11 deletions pkg/flux/templates/components.sh
@@ -21,7 +21,7 @@ url=$goshareUrl/wait-fs-{{ .Spec.Flux.Arch }}
 # This waiting script is intended to wait for the flux view, and then start running
 curl -L -O -s -o ./wait-fs -s ${url} {{ if .Spec.Logging.Quiet }}> /dev/null 2>&1{{ end }} || wget ${url} -q -O ./wait-fs {{ if .Spec.Logging.Quiet }}> /dev/null 2>&1{{ end }} || true
 chmod +x ./wait-fs || true
-mv ./wait-fs /usr/bin/goshare-wait-fs || true
+${SUDO} mv ./wait-fs /usr/bin/goshare-wait-fs || true
 
 # Ensure spack view is on the path, wherever it is mounted
 viewbase="{{ .ViewBase }}"
@@ -47,9 +47,9 @@ goshare-wait-fs -p ${viewbase}/flux-operator-done.txt {{ if .Spec.Logging.Quiet
 # Copy mount software to /opt/software
 # If /opt/software already exists, we need to copy into it
 if [[ -e "/opt/software" ]]; then
-    cp -R ${viewbase}/software/* /opt/software/ || true
+    ${SUDO} cp -R ${viewbase}/software/* /opt/software/ || true
 else
-    cp -R ${viewbase}/software /opt/software || true
+    ${SUDO} cp -R ${viewbase}/software /opt/software || true
 fi
 {{end}}
 
@@ -72,10 +72,10 @@ echo "Python root: $foundroot" {{ if .Spec.Logging.Quiet }} > /dev/null 2>&1{{ e

 # If we found the right python, ensure it's linked (old link does not work)
 if [[ -f "${pythonversion}" ]]; then
-    rm -rf $viewroot/bin/python3
-    rm -rf $viewroot/bin/python
-    ln -s ${pythonversion} $viewroot/lib/python || true
-    ln -s ${pythonversion} $viewroot/lib/python3 || true
+    ${SUDO} rm -rf $viewroot/bin/python3
+    ${SUDO} rm -rf $viewroot/bin/python
+    ${SUDO} ln -s ${pythonversion} $viewroot/lib/python || true
+    ${SUDO} ln -s ${pythonversion} $viewroot/lib/python3 || true
 fi
 
 # Ensure we use flux's python (TODO update this to use variable)
@@ -87,15 +87,16 @@ find $viewroot . -name libpython*.so* {{ if .Spec.Logging.Quiet }}> /dev/null 2>
 ls -l /mnt/flux/view/lib/libpython3.11.so.1.0 {{ if .Spec.Logging.Quiet }}> /dev/null 2>&1{{ end }}
 
 # Write an easy file we can source for the environment
-cat <<EOT >> ${viewbase}/flux-view.sh
+cat <<EOT >> ./flux-view.sh
 #!/bin/bash
 export PATH=$PATH
 export PYTHONPATH=$PYTHONPATH
 export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:$viewroot/lib
 export fluxsocket=local://${viewroot}/run/flux/local
 EOT
+${SUDO} mv ./flux-view.sh ${viewbase}/flux-view.sh
 {{end}}
 {{define "ensure-pip"}}
-${pythonversion} -m pip --version || ${pythonversion} -m ensurepip || (wget https://bootstrap.pypa.io/get-pip.py && ${pythonversion} ./get-pip.py) {{ if .Spec.Logging.Quiet }}> /dev/null 2>&1{{ end }}
-${pythonversion} -m pip --upgrade pip {{ if .Spec.Logging.Quiet }}> /dev/null 2>&1{{ end }}
-{{end}}
+${SUDO} ${pythonversion} -m pip --version || ${SUDO} ${pythonversion} -m ensurepip || (${SUDO} wget https://bootstrap.pypa.io/get-pip.py && ${pythonversion} ./get-pip.py) {{ if .Spec.Logging.Quiet }}> /dev/null 2>&1{{ end }}
+${SUDO} ${pythonversion} -m pip --upgrade pip {{ if .Spec.Logging.Quiet }}> /dev/null 2>&1{{ end }}
+{{end}}
41 changes: 26 additions & 15 deletions pkg/flux/templates/wait.sh
@@ -6,6 +6,18 @@
 # We use the actual time command and not the wrapper, otherwise we get there is no argument -f
 {{ if .Spec.Logging.Timed }}which /usr/bin/time > /dev/null 2>&1 || (echo "/usr/bin/time is required to use logging.timed true" && exit 1);{{ end }}
 
+# Set the flux user and id from the getgo
+fluxuser=$(whoami)
+fluxuid=$(id -u $fluxuser)
+
+# Add fluxuser to sudoers living... dangerously!
+# A non-root user container requires sudo to work
+SUDO=""
+if [[ "${fluxuser}" != "root" ]]; then
+    echo "${fluxuser} ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
+    SUDO="sudo"
+fi
+
 # If any initCommand logic is defined
 {{ .Container.Commands.Init}} {{ if .Spec.Logging.Quiet }}> /dev/null{{ end }}
 
@@ -14,10 +26,6 @@
{{template "wait-view" .}}
{{ if not .Spec.Flux.Container.Disable }}{{template "paths" .}}{{ end }}

# Set the flux user and id from the getgo
fluxuser=$(whoami)
fluxuid=$(id -u $fluxuser)

# Variables we can use again
cfg="${viewroot}/etc/flux/config"
command="{{ .Container.Command }}"
@@ -28,19 +36,21 @@ command="{{ .Container.Command }}"
 {{ if not .Spec.Logging.Quiet }}
 echo
 echo "Hello user ${fluxuser}"{{ end }}
 
-# Add fluxuser to sudoers living... dangerously!
-if [[ "${fluxuser}" != "root" ]]; then
-    echo "${fluxuser} ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
-fi
-
 # Ensure the flux user owns the curve.cert
 # We need to move the curve.cert because config map volume is read only
 curvesrc=/flux_operator/curve.cert
 curvepath=$viewroot/curve/curve.cert
 
-mkdir -p $viewroot/curve
-cp $curvesrc $curvepath
+# run directory must be owned by this user
+# and /var/lib/flux
+if [[ "${fluxuser}" != "root" ]]; then
+    ${SUDO} chown -R ${fluxuser} ${viewroot}/run/flux ${viewroot}/var/lib/flux
+fi
+
+# Prepare curve certificate!
+${SUDO} mkdir -p $viewroot/curve
+${SUDO} cp $curvesrc $curvepath
 {{ if not .Spec.Logging.Quiet }}
 echo
 echo "🌟️ Curve Certificate"
@@ -49,9 +59,9 @@ cat ${curvepath}
 {{ end }}
 
 # Remove group and other read
-chmod o-r ${curvepath}
-chmod g-r ${curvepath}
-chown -R ${fluxuid} ${curvepath}
+${SUDO} chmod o-r ${curvepath}
+${SUDO} chmod g-r ${curvepath}
+${SUDO} chown -R ${fluxuid} ${curvepath}
 
 # If we have disabled the view, we need to use the flux here to generate resources
 {{ if .Spec.Flux.Container.Disable }}
@@ -61,7 +71,8 @@ echo
echo "📦 Resources"
echo "flux R encode --hosts=${hosts} --local"
{{ end }}
flux R encode --hosts=${hosts} --local > ${viewroot}/etc/flux/system/R
flux R encode --hosts=${hosts} --local > /tmp/R
${SUDO} mv /tmp/R ${viewroot}/etc/flux/system/R
{{ if not .Spec.Logging.Quiet }}cat ${viewroot}/etc/flux/system/R{{ end }}
{{ end }}

1 change: 1 addition & 0 deletions sdk/python/v1alpha2/docs/PodSpec.md
@@ -11,6 +11,7 @@ Name | Type | Description | Notes
 **node_selector** | **dict[str, str]** | NodeSelectors for a pod | [optional]
 **resources** | [**dict[str, IntOrString]**](IntOrString.md) | Resources include limits and requests | [optional]
 **restart_policy** | **str** | Restart Policy | [optional]
+**runtime_class_name** | **str** | RuntimeClassName for the pod | [optional]
 **scheduler_name** | **str** | Scheduler name for the pod | [optional]
 **service_account_name** | **str** | Service account name for the pod | [optional]
 