fix: enhance cluster setup script for model training #137
Workflow file for this run
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
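# Workflow on Push/PR
# - unit test
# - build image (push if docker-secret (IMAGE_REPO, BOT_NAME, BOT_TOKEN) exists)
# - collect data (if aws-secret exists)
# - train model (if aws-secret exists)
# NOTE(review): the trigger in this file is `pull_request` only and the image
# builds here use `push: false`; the push / collect-data / train-model items
# may describe a sibling workflow — confirm and trim this list if so.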
# Display name of the workflow.
name: Workflow on Push/PR

# Triggered by pull requests. Paths listed under paths-ignore (figures,
# issue templates, editor settings, licence, gitignore, Markdown) are
# excluded from the trigger's path matching.
on:
  pull_request:
    paths-ignore:
      - 'fig/**'
      - '.github/ISSUE_TEMPLATE/**'
      - '.vscode/**'
      - 'LICENSE'
      - '.gitignore'
      - '*.md'
      - '**/*.md'

# Workflow-wide environment. TAG is the image tag that check-branch emits
# when the run is for the main branch.
env:
  TAG: 'v0.7.7'

jobs:
# Derives the image tag for this run and exposes it as the `tag` output:
#   pull_request event -> pr-<pull request number>
#   ref name "main"    -> the workflow-level TAG value
#   anything else      -> the commit SHA
check-branch:
  runs-on: ubuntu-latest
  outputs:
    tag: ${{ steps.image-tag.outputs.tag }}
  steps:
    - uses: actions/checkout@v4
    - name: Find Image Tag
      id: image-tag
      env:
        BRANCH: ${{ github.ref_name }}
        COMMIT: ${{ github.sha }}
      run: |
        # Pick the tag first, then write it to the step output exactly once.
        if [ "${{ github.event_name }}" == 'pull_request' ]; then
          image_tag="pr-${{ github.event.number }}"
        elif [ "$BRANCH" == "main" ]; then
          image_tag="${{ env.TAG }}"
        else
          image_tag="$COMMIT"
        fi
        echo "tag=${image_tag}" >> "$GITHUB_OUTPUT"
# Classifies the changed files into four groups (base, data, modeling, s3)
# with dorny/paths-filter and republishes each group's filter result as a
# job output of the same name for downstream jobs.
check-change:
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v4
    - id: filter
      uses: dorny/paths-filter@v3
      with:
        # The filter definition is itself a YAML document handed to the action.
        filters: |
          base:
            - 'pyproject.toml'
            - 'dockerfiles/Dockerfile.base'
            - '.github/workflows/build-push.yml'
          data:
            - 'src/util/prom_types.py'
            - 'src/util/train_types.py'
            - 'src/train/prom/**'
            - 'model_training/tekton/tasks/stressng-task.yaml'
            - 'model_training/tekton/pipelines/collect.yaml'
            - 'hack/**'
            - '.github/workflows/collect-data-self-hosted.yml'
          modeling:
            - 'src/**'
            - 'model_training/**'
            - 'hack/**'
            - '.github/workflows/collect-data-self-hosted.yml'
            - '.github/workflows/train-model.yml'
          s3:
            - 'model_training/s3/**'
  outputs:
    base: ${{ steps.filter.outputs.base }}
    data: ${{ steps.filter.outputs.data }}
    modeling: ${{ steps.filter.outputs.modeling }}
    s3: ${{ steps.filter.outputs.s3 }}
# Reports whether the BOT_TOKEN secret is populated, as the `docker-secret`
# output ("true" / "false"). The secret is only checked for emptiness.
check-secret:
  runs-on: ubuntu-latest
  outputs:
    docker-secret: ${{ steps.check-docker-secret.outputs.available }}
  steps:
    - name: Check Docker Secret
      id: check-docker-secret
      env:
        DOCKER_SECRET: ${{ secrets.BOT_TOKEN }}
      run: |
        if [ -n "$DOCKER_SECRET" ]; then
          echo "available=true" >> "$GITHUB_OUTPUT"
        else
          echo "available=false" >> "$GITHUB_OUTPUT"
        fi
# Probes whether the base image can be pulled and publishes the result as the
# `exists` output ("true" / "false").
# NOTE(review): BASE_IMAGE is not defined in the workflow-level `env` visible
# in this file. While it is unset this job always reports exists=false, which
# makes the base-image job run on every pull request — confirm the intended
# image reference and define it.
check-base-exist:
  runs-on: ubuntu-latest
  outputs:
    exists: ${{ steps.check-base-exist.outputs.exists }}
  steps:
    - name: Check if Docker base image exists
      id: check-base-exist
      env:
        # Hand the value over via the environment instead of splicing the
        # expression into the script text, so it can be quoted safely.
        IMAGE_REF: ${{ env.BASE_IMAGE }}
      run: |
        if [ -z "$IMAGE_REF" ]; then
          # Make the misconfiguration visible instead of relying on
          # `docker pull` failing with a usage error.
          echo "::warning::BASE_IMAGE is not set; treating the base image as missing"
          echo "exists=false" >> "$GITHUB_OUTPUT"
        elif docker pull "$IMAGE_REF"; then
          echo "exists=true" >> "$GITHUB_OUTPUT"
        else
          echo "exists=false" >> "$GITHUB_OUTPUT"
        fi
# Calls the reusable unit-test workflow, forwarding the `base` filter result
# from check-change as its base_change input.
unit-test:
  uses: ./.github/workflows/unit-test.yml
  needs:
    - check-change
  with:
    base_change: ${{ needs.check-change.outputs.base }}
# Builds the base image from dockerfiles/Dockerfile.base (push is disabled)
# when the existence probe reported "false" or the `base` path filter matched.
# On completion the `change` output is set to "true".
base-image:
  needs:
    - check-base-exist
    - check-branch
    - check-change
  if: ${{ (needs.check-base-exist.outputs.exists == 'false') || (needs.check-change.outputs.base == 'true') }}
  runs-on: ubuntu-latest
  env:
    tag: ${{ needs.check-branch.outputs.tag }}
  outputs:
    change: ${{ steps.record.outputs.change }}
  steps:
    - name: checkout
      uses: actions/checkout@v4

    - name: set up QEMU
      uses: docker/setup-qemu-action@v3

    - name: set up Docker Buildx
      uses: docker/setup-buildx-action@v3

    - name: Build base image
      uses: docker/build-push-action@v6
      with:
        file: dockerfiles/Dockerfile.base
        # Image repo falls back to docker.io/library when IMAGE_REPO is unset.
        tags: ${{ vars.IMAGE_REPO || 'docker.io/library' }}/kepler_model_server_base:${{ needs.check-branch.outputs.tag }}
        push: false

    - name: Record change
      id: record
      run: |
        echo "change=true" >> "$GITHUB_OUTPUT"
# Builds the s3 helper image from model_training/s3 when the `s3` path filter
# matched, loads it into the local Docker daemon (push is disabled), and
# smoke-tests it by running `s3-pusher --version` and `s3-loader --version`.
# On completion the `change` output is set to "true".
s3-image:
  if: ${{ needs.check-change.outputs.s3 == 'true' }}
  needs: [check-change, check-branch]
  runs-on: ubuntu-latest
  outputs:
    change: ${{ steps.record.outputs.change }}
  env:
    tag: ${{ needs.check-branch.outputs.tag }}
  steps:
    - name: checkout
      uses: actions/checkout@v4
    - name: set up QEMU
      uses: docker/setup-qemu-action@v3
    - name: set up Docker Buildx
      uses: docker/setup-buildx-action@v3
    - name: Build s3 image
      uses: docker/build-push-action@v6
      with:
        context: model_training/s3
        file: model_training/s3/Dockerfile
        push: false
        # load: true makes the built image available to `docker run` below.
        load: true
        tags: ${{ vars.IMAGE_REPO || 'docker.io/library' }}/kepler_model_server/s3:${{ needs.check-branch.outputs.tag }}
    - name: Test s3 image
      env:
        # Same reference as the build step's `tags`; passed via the
        # environment so the script can quote it instead of having the
        # expression spliced into the shell text.
        IMG: ${{ vars.IMAGE_REPO || 'docker.io/library' }}/kepler_model_server/s3:${{ needs.check-branch.outputs.tag }}
      run: |
        docker run --rm "$IMG" s3-pusher --version
        docker run --rm "$IMG" s3-loader --version
    - name: Record change
      id: record
      run: |
        echo "change=true" >> "$GITHUB_OUTPUT"
# Calls the reusable tekton-test workflow. `if: always()` lets it run even
# when a needed job (e.g. base-image) was skipped or failed.
tekton-test:
  if: always()
  needs:
    - check-secret
    - check-branch
    - check-change
    - base-image
  uses: ./.github/workflows/tekton-test.yml
  with:
    image_repo: ${{ vars.IMAGE_REPO || 'docker.io/library' }}
    image_tag: ${{ needs.check-branch.outputs.tag }}
    base_change: ${{ needs.check-change.outputs.base }}
    docker_secret: ${{ needs.check-secret.outputs.docker-secret }}
    pipeline_name: std_v0.7
# Calls the reusable integration-test workflow with additional_opts " TEST".
# `if: always()` lets it run even when a needed job was skipped or failed.
integration-test-internal-only:
  if: always()
  needs:
    - check-secret
    - check-branch
    - check-change
    - base-image
  uses: ./.github/workflows/integration-test.yml
  with:
    image_repo: ${{ vars.IMAGE_REPO || 'docker.io/library' }}
    image_tag: ${{ needs.check-branch.outputs.tag }}
    base_change: ${{ needs.check-change.outputs.base }}
    docker_secret: ${{ needs.check-secret.outputs.docker-secret }}
    kepler_tag: release-0.7.7
    # The leading space is part of the value passed to the called workflow.
    additional_opts: " TEST"
# Calls the reusable integration-test workflow with empty additional_opts.
# `if: always()` lets it run even when a needed job was skipped or failed.
integration-test-with-exporter:
  if: always()
  needs:
    - check-secret
    - check-branch
    - check-change
    - base-image
  uses: ./.github/workflows/integration-test.yml
  with:
    image_repo: ${{ vars.IMAGE_REPO || 'docker.io/library' }}
    image_tag: ${{ needs.check-branch.outputs.tag }}
    base_change: ${{ needs.check-change.outputs.base }}
    docker_secret: ${{ needs.check-secret.outputs.docker-secret }}
    kepler_tag: release-0.7.7
    # Explicit empty string: no extra options for this variant.
    additional_opts: ""
# Calls the reusable integration-test workflow with additional_opts " DB".
# `if: always()` lets it run even when a needed job was skipped or failed.
integration-test-with-exporter-and-db:
  if: always()
  needs:
    - check-secret
    - check-branch
    - check-change
    - base-image
  uses: ./.github/workflows/integration-test.yml
  with:
    image_repo: ${{ vars.IMAGE_REPO || 'docker.io/library' }}
    image_tag: ${{ needs.check-branch.outputs.tag }}
    base_change: ${{ needs.check-change.outputs.base }}
    docker_secret: ${{ needs.check-secret.outputs.docker-secret }}
    kepler_tag: release-0.7.7
    # The leading space is part of the value passed to the called workflow.
    additional_opts: " DB"
# Exercises model_training/script.sh on the runner: first the
# `prepare_cluster` subcommand, then `cleanup`.
verify_model_training_script:
  runs-on: ubuntu-latest
  steps:
    # The original action reference was garbled into "[email protected]" by
    # e-mail obfuscation; restored to the major version the other jobs in
    # this workflow use.
    - uses: actions/checkout@v4
    - name: local script test
      working-directory: model_training
      run: ./script.sh prepare_cluster
    - name: clean up
      working-directory: model_training
      run: ./script.sh cleanup