Skip to content

Commit

Permalink
Add more tests to cover containerd/driver container k8s deployments
Browse files Browse the repository at this point in the history
  • Loading branch information
supertetelman committed Mar 24, 2022
1 parent e6baed1 commit d30b231
Show file tree
Hide file tree
Showing 3 changed files with 310 additions and 16 deletions.
7 changes: 5 additions & 2 deletions virtual/scripts/setup_k8s.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,12 @@ K8S_CONFIG_DIR="${VIRT_DIR}/config"

ansible_extra_args=""
if [ ${DEEPOPS_K8S_OPERATOR_EXISTING_SOFTWARE} ]; then
ansible_extra_args="${ansible_extra_args} -e deepops_gpu_operator_enabled=true -e gpu_operator_preinstalled_nvidia_software=true"
ansible_extra_args="${ansible_extra_args} -e deepops_gpu_operator_enabled=true -e gpu_operator_preinstalled_nvidia_software=${DEEPOPS_K8S_OPERATOR_EXISTING_SOFTWARE}"
elif [ ${DEEPOPS_K8S_OPERATOR} ]; then
ansible_extra_args="${ansible_extra_args} -e deepops_gpu_operator_enabled=true"
ansible_extra_args="${ansible_extra_args} -e deepops_gpu_operator_enabled=${DEEPOPS_K8S_OPERATOR}"
fi
if [ ${DEEPOPS_K8S_CONTAINER_MANAGER} ]; then
ansible_extra_args="${ansible_extra_args} -e container_manager=${DEEPOPS_K8S_CONTAINER_MANAGER}"
fi

# Deploy the K8s cluster
Expand Down
173 changes: 166 additions & 7 deletions workloads/jenkins/Jenkinsfile-multi-nightly
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ pipeline {
agent any
environment {
DEEPOPS_NIGHTLY = 'true'
// DEEPOPS_FULL_INSTALL = 'true'
DEEPOPS_FULL_INSTALL = 'true'
// DEEPOPS_VAGRANT_OS = 'ubuntu'
// DEEPOPS_OS_VERSION = '18.04'
}
Expand All @@ -19,7 +19,6 @@ pipeline {
}
steps {
// TODO: ideally lock should work with declared stages
// TODO: determine how to pass a variable into quantity and merge this with the nightly jenkinsfile
lock(resource: null, label: 'gpu', quantity: 2, variable: 'GPUDATA') {
echo "Reset repo and unmunge files"
sh '''
Expand All @@ -43,8 +42,10 @@ pipeline {
bash -x ./workloads/jenkins/scripts/vagrant-startup.sh
'''

echo "Cluster Up - MGMT Nodes"
echo "Cluster Up - MGMT Nodes - device plugin + docker"
sh '''
export DEEPOPS_K8S_OPERATOR=false
export DEEPOPS_K8S_CONTAINER_MANAGER=docker
bash -x ./workloads/jenkins/scripts/test-cluster-up.sh
'''

Expand Down Expand Up @@ -80,7 +81,7 @@ pipeline {

echo "Test Monitoring installation"
sh '''
timeout 2000 bash -x ./workloads/jenkins/scripts/test-monitoring.sh
timeout 1200 bash -x ./workloads/jenkins/scripts/test-monitoring.sh
'''

echo "Test Dashboard installation"
Expand All @@ -93,14 +94,172 @@ pipeline {
timeout 1500 bash -x ./workloads/jenkins/scripts/test-kubeflow-pipeline.sh
'''

echo "Start new virtual environment pre-GPU Operator checks"
echo "Start new virtual environment"
sh '''
bash -x ./workloads/jenkins/scripts/vagrant-startup.sh
'''

echo "Cluster Up - MGMT Nodes"
echo "Cluster Up - MGMT Nodes gpu operator + docker"
sh '''
export DEEPOPS_K8S_OPERATOR=true
export DEEPOPS_K8S_CONTAINER_MANAGER=docker
bash -x ./workloads/jenkins/scripts/test-cluster-up.sh
'''

echo "Get K8S Cluster Status"
sh '''
export DEEPOPS_K8S_OPERATOR=true
bash -x ./workloads/jenkins/scripts/get-k8s-debug.sh
'''

echo "Verify we can run a GPU job"
sh '''
export DEEPOPS_K8S_OPERATOR=true
timeout 500 bash -x ./workloads/jenkins/scripts/run-gpu-job.sh
'''

echo "Verify ingress config"
sh '''
bash -x ./workloads/jenkins/scripts/verify-ingress-config.sh
'''

echo "Verify local docker registry"
sh '''
bash -x ./workloads/jenkins/scripts/test-local-registry.sh
'''

echo "Verify rsyslog forwarding is working for the k8s cluster"
sh '''
bash -x ./workloads/jenkins/scripts/test-rsyslog-k8s.sh
'''

echo "Test Monitoring installation"
sh '''
timeout 1200 bash -x ./workloads/jenkins/scripts/test-monitoring.sh
'''

echo "Test Dashboard installation"
sh '''
timeout 180 bash -x ./workloads/jenkins/scripts/test-dashboard.sh
'''

echo "Start new virtual environment"
sh '''
bash -x ./workloads/jenkins/scripts/vagrant-startup.sh
'''

echo "Cluster Up - MGMT Nodes gpu operator + containerd"
sh '''
export DEEPOPS_K8S_OPERATOR=true
export DEEPOPS_K8S_CONTAINER_MANAGER=containerd
bash -x ./workloads/jenkins/scripts/test-cluster-up.sh
'''

echo "Get K8S Cluster Status"
sh '''
export DEEPOPS_K8S_OPERATOR=true
bash -x ./workloads/jenkins/scripts/get-k8s-debug.sh
'''

echo "Verify we can run a GPU job"
sh '''
export DEEPOPS_K8S_OPERATOR=true
timeout 500 bash -x ./workloads/jenkins/scripts/run-gpu-job.sh
'''

echo "Verify ingress config"
sh '''
bash -x ./workloads/jenkins/scripts/verify-ingress-config.sh
'''

echo "Verify local docker registry"
sh '''
echo "unsupported configuration" # TODO bash -x ./workloads/jenkins/scripts/test-local-registry.sh
'''

echo "Verify rsyslog forwarding is working for the k8s cluster"
sh '''
bash -x ./workloads/jenkins/scripts/test-rsyslog-k8s.sh
'''

echo "Test Kubeflow installation"
sh '''
timeout 4000 bash -x ./workloads/jenkins/scripts/test-kubeflow.sh
'''

echo "Test Monitoring installation"
sh '''
timeout 1200 bash -x ./workloads/jenkins/scripts/test-monitoring.sh
'''

echo "Test Dashboard installation"
sh '''
timeout 180 bash -x ./workloads/jenkins/scripts/test-dashboard.sh
'''

echo "Test Kubeflow pipeline"
sh '''
timeout 1500 bash -x ./workloads/jenkins/scripts/test-kubeflow-pipeline.sh
'''

echo "Start new virtual environment pre-GPU Operator with existing software checks"
sh '''
bash -x ./workloads/jenkins/scripts/vagrant-startup.sh
'''

echo "Cluster Up - MGMT Nodes gpu operator + docker + drivers"
sh '''
export DEEPOPS_K8S_OPERATOR_EXISTING_SOFTWARE=true
export DEEPOPS_K8S_CONTAINER_MANAGER=docker
bash -x ./workloads/jenkins/scripts/test-cluster-up.sh
'''

echo "Get K8S Cluster Status"
sh '''
export DEEPOPS_K8S_OPERATOR=true
bash -x ./workloads/jenkins/scripts/get-k8s-debug.sh
'''

echo "Verify we can run a GPU job"
sh '''
export DEEPOPS_K8S_OPERATOR=true
timeout 500 bash -x ./workloads/jenkins/scripts/run-gpu-job.sh
'''

echo "Verify ingress config"
sh '''
bash -x ./workloads/jenkins/scripts/verify-ingress-config.sh
'''

echo "Verify local docker registry"
sh '''
bash -x ./workloads/jenkins/scripts/test-local-registry.sh
'''

echo "Verify rsyslog forwarding is working for the k8s cluster"
sh '''
bash -x ./workloads/jenkins/scripts/test-rsyslog-k8s.sh
'''

echo "Test Monitoring installation"
sh '''
timeout 1200 bash -x ./workloads/jenkins/scripts/test-monitoring.sh
'''

echo "Test Dashboard installation"
sh '''
timeout 180 bash -x ./workloads/jenkins/scripts/test-dashboard.sh
'''

echo "Start new virtual environment"
sh '''
bash -x ./workloads/jenkins/scripts/vagrant-startup.sh
'''

echo "Cluster Up - MGMT Nodes gpu operator + containerd + drivers"
sh '''
export DEEPOPS_K8S_OPERATOR_EXISTING_SOFTWARE=true
export DEEPOPS_K8S_CONTAINER_MANAGER=containerd
bash -x ./workloads/jenkins/scripts/test-cluster-up.sh
'''

Expand Down Expand Up @@ -163,7 +322,7 @@ pipeline {

echo "Test DCGM metrics"
sh '''
timeout 600 bash -x ./workloads/jenkins/scripts/test-dcgm-metrics.sh slurm-node
timeout 500 bash -x ./workloads/jenkins/scripts/test-dcgm-metrics.sh slurm-node
'''

echo "Reset repo and unmunge files"
Expand Down
Loading

0 comments on commit d30b231

Please sign in to comment.