Skip to content

fix: enhance cluster setup script for model training #137

fix: enhance cluster setup script for model training

fix: enhance cluster setup script for model training #137

Workflow file for this run

# Workflow on Push/PR (this file is triggered by pull_request events)
# - unit test
# - build image (push if docker-secret (IMAGE_REPO, BOT_NAME, BOT_TOKEN) exists)
# - collect data (if aws-secret exists)
# - train model (if aws-secret exists)
name: Workflow on Push/PR

# Runs for pull requests. Changes that only touch figures, issue templates,
# editor settings, the license, .gitignore or Markdown files do not trigger it.
on:
  pull_request:
    paths-ignore:
      - "fig/**"
      - ".github/ISSUE_TEMPLATE/**"
      - ".vscode/**"
      - "LICENSE"
      - ".gitignore"
      - "*.md"
      - "**/*.md"

env:
  # Tag that the check-branch job emits when the branch is main.
  TAG: 'v0.7.7'

jobs:
check-branch:
  # Derives the image tag that the image-build and test jobs read through
  # needs.check-branch.outputs.tag.
  runs-on: ubuntu-latest
  outputs:
    tag: ${{ steps.image-tag.outputs.tag }}
  steps:
    - uses: actions/checkout@v4
    - name: Find Image Tag
      id: image-tag
      env:
        BRANCH: ${{ github.ref_name }}
        COMMIT: ${{ github.sha }}
      # Tag selection:
      #   pull_request event -> pr-<number>
      #   main branch        -> workflow-level TAG
      #   anything else      -> commit SHA
      run: |
        if [ "${{ github.event_name }}" == 'pull_request' ]; then
          tag="pr-${{ github.event.number }}"
        elif [ "$BRANCH" == "main" ]; then
          tag="${{ env.TAG }}"
        else
          tag="$COMMIT"
        fi
        echo "tag=${tag}" >> "$GITHUB_OUTPUT"
check-change:
  # Classifies the changed paths into four groups (base, data, modeling, s3)
  # and republishes each paths-filter result as a job output.
  runs-on: ubuntu-latest
  outputs:
    base: ${{ steps.filter.outputs.base }}
    data: ${{ steps.filter.outputs.data }}
    modeling: ${{ steps.filter.outputs.modeling }}
    s3: ${{ steps.filter.outputs.s3 }}
  steps:
    - uses: actions/checkout@v4
    - id: filter
      uses: dorny/paths-filter@v3
      with:
        # The filter definition is itself a YAML document passed as a string.
        filters: |
          base:
            - 'pyproject.toml'
            - 'dockerfiles/Dockerfile.base'
            - '.github/workflows/build-push.yml'
          data:
            - 'src/util/prom_types.py'
            - 'src/util/train_types.py'
            - 'src/train/prom/**'
            - 'model_training/tekton/tasks/stressng-task.yaml'
            - 'model_training/tekton/pipelines/collect.yaml'
            - 'hack/**'
            - '.github/workflows/collect-data-self-hosted.yml'
          modeling:
            - 'src/**'
            - 'model_training/**'
            - 'hack/**'
            - '.github/workflows/collect-data-self-hosted.yml'
            - '.github/workflows/train-model.yml'
          s3:
            - 'model_training/s3/**'
check-secret:
  # Reports, as the string 'true' or 'false', whether the BOT_TOKEN secret
  # has a non-empty value in this run.
  runs-on: ubuntu-latest
  outputs:
    docker-secret: ${{ steps.check-docker-secret.outputs.available }}
  steps:
    - name: Check Docker Secret
      id: check-docker-secret
      env:
        DOCKER_SECRET: ${{ secrets.BOT_TOKEN }}
      run: |
        if [ -n "$DOCKER_SECRET" ]; then
          echo "available=true" >> "$GITHUB_OUTPUT"
        else
          echo "available=false" >> "$GITHUB_OUTPUT"
        fi
check-base-exist:
  # Tries to pull the base image and publishes the outcome as the string
  # output `exists` ('true' / 'false'), which gates the base-image job.
  runs-on: ubuntu-latest
  outputs:
    exists: ${{ steps.check-base-exist.outputs.exists }}
  steps:
    - name: Check if Docker base image exists
      id: check-base-exist
      env:
        # The workflow-level env only defines TAG, so env.BASE_IMAGE used to
        # expand to nothing: `docker pull` ran without an argument, always
        # failed, and `exists` was always 'false'.
        # BASE_IMAGE is still honoured when something defines it; otherwise
        # the reference is assembled from the repository/name used by the
        # base-image job.
        # NOTE(review): the fallback tag (workflow-level TAG) is an assumption
        # about which published base image should be probed - confirm.
        IMAGE: ${{ env.BASE_IMAGE || format('{0}/kepler_model_server_base:{1}', vars.IMAGE_REPO || 'docker.io/library', env.TAG) }}
      run: |
        if docker pull "$IMAGE"; then
          echo "exists=true" >> "$GITHUB_OUTPUT"
        else
          echo "exists=false" >> "$GITHUB_OUTPUT"
        fi
unit-test:
  # Calls the reusable unit-test workflow, forwarding whether any
  # base-image-related file changed.
  needs:
    - check-change
  uses: ./.github/workflows/unit-test.yml
  with:
    base_change: ${{ needs.check-change.outputs.base }}
base-image:
  # Builds the base image (never pushes it) when the existence check reported
  # it missing or when base-related files changed.
  needs:
    - check-base-exist
    - check-branch
    - check-change
  if: needs.check-base-exist.outputs.exists == 'false' || needs.check-change.outputs.base == 'true'
  runs-on: ubuntu-latest
  outputs:
    change: ${{ steps.record.outputs.change }}
  env:
    tag: ${{ needs.check-branch.outputs.tag }}
  steps:
    - name: checkout
      uses: actions/checkout@v4

    - name: set up QEMU
      uses: docker/setup-qemu-action@v3

    - name: set up Docker Buildx
      uses: docker/setup-buildx-action@v3

    - name: Build base image
      uses: docker/build-push-action@v6
      with:
        file: dockerfiles/Dockerfile.base
        tags: ${{ vars.IMAGE_REPO || 'docker.io/library' }}/kepler_model_server_base:${{ needs.check-branch.outputs.tag }}
        push: false

    # Only reached when the build above succeeded.
    - name: Record change
      id: record
      run: echo "change=true" >> "$GITHUB_OUTPUT"
s3-image:
  # Builds the s3 helper image when files under model_training/s3 changed,
  # then smoke-tests the two entry points it ships.
  if: ${{ needs.check-change.outputs.s3 == 'true' }}
  needs: [check-change, check-branch]
  runs-on: ubuntu-latest
  outputs:
    change: ${{ steps.record.outputs.change }}
  env:
    tag: ${{ needs.check-branch.outputs.tag }}
  steps:
    - name: checkout
      uses: actions/checkout@v4
    - name: set up QEMU
      uses: docker/setup-qemu-action@v3
    - name: set up Docker Buildx
      uses: docker/setup-buildx-action@v3
    - name: Build s3 image
      uses: docker/build-push-action@v6
      with:
        push: false
        # Load the result into the local Docker daemon so the next step can run it.
        load: true
        tags: ${{ vars.IMAGE_REPO || 'docker.io/library' }}/kepler_model_server/s3:${{ needs.check-branch.outputs.tag }}
        context: model_training/s3
        file: model_training/s3/Dockerfile
    - name: Test s3 image
      env:
        # Hand the image reference to the script through the environment
        # instead of splicing the expression into the shell source, and quote
        # it on use so it is never word-split or glob-expanded.
        IMG: ${{ vars.IMAGE_REPO || 'docker.io/library' }}/kepler_model_server/s3:${{ needs.check-branch.outputs.tag }}
      run: |
        docker run --rm "$IMG" s3-pusher --version
        docker run --rm "$IMG" s3-loader --version
    - name: Record change
      id: record
      run: |
        echo "change=true" >> "$GITHUB_OUTPUT"
tekton-test:
  # Calls the reusable tekton-test workflow with the computed image tag and
  # the change/secret flags.
  needs: [check-secret, check-branch, check-change, base-image]
  # base-image is conditional, so this job must still run when it was skipped
  # (or failed). `!cancelled()` keeps that behaviour but, unlike `always()`,
  # does not start the job after the run has been cancelled.
  if: ${{ !cancelled() }}
  uses: ./.github/workflows/tekton-test.yml
  with:
    base_change: ${{ needs.check-change.outputs.base }}
    docker_secret: ${{ needs.check-secret.outputs.docker-secret }}
    image_repo: ${{ vars.IMAGE_REPO || 'docker.io/library' }}
    image_tag: ${{ needs.check-branch.outputs.tag }}
    pipeline_name: std_v0.7
integration-test-internal-only:
  # Calls the reusable integration-test workflow with additional_opts " TEST".
  needs: [check-secret, check-branch, check-change, base-image]
  # base-image is conditional, so this job must still run when it was skipped
  # (or failed). `!cancelled()` keeps that behaviour but, unlike `always()`,
  # does not start the job after the run has been cancelled.
  if: ${{ !cancelled() }}
  uses: ./.github/workflows/integration-test.yml
  with:
    base_change: ${{ needs.check-change.outputs.base }}
    docker_secret: ${{ needs.check-secret.outputs.docker-secret }}
    image_repo: ${{ vars.IMAGE_REPO || 'docker.io/library' }}
    image_tag: ${{ needs.check-branch.outputs.tag }}
    kepler_tag: release-0.7.7
    additional_opts: " TEST"
integration-test-with-exporter:
  # Calls the reusable integration-test workflow with empty additional_opts.
  needs: [check-secret, check-branch, check-change, base-image]
  # base-image is conditional, so this job must still run when it was skipped
  # (or failed). `!cancelled()` keeps that behaviour but, unlike `always()`,
  # does not start the job after the run has been cancelled.
  if: ${{ !cancelled() }}
  uses: ./.github/workflows/integration-test.yml
  with:
    base_change: ${{ needs.check-change.outputs.base }}
    docker_secret: ${{ needs.check-secret.outputs.docker-secret }}
    image_repo: ${{ vars.IMAGE_REPO || 'docker.io/library' }}
    image_tag: ${{ needs.check-branch.outputs.tag }}
    kepler_tag: release-0.7.7
    additional_opts: ""
integration-test-with-exporter-and-db:
  # Calls the reusable integration-test workflow with additional_opts " DB".
  needs: [check-secret, check-branch, check-change, base-image]
  # base-image is conditional, so this job must still run when it was skipped
  # (or failed). `!cancelled()` keeps that behaviour but, unlike `always()`,
  # does not start the job after the run has been cancelled.
  if: ${{ !cancelled() }}
  uses: ./.github/workflows/integration-test.yml
  with:
    base_change: ${{ needs.check-change.outputs.base }}
    docker_secret: ${{ needs.check-secret.outputs.docker-secret }}
    image_repo: ${{ vars.IMAGE_REPO || 'docker.io/library' }}
    image_tag: ${{ needs.check-branch.outputs.tag }}
    kepler_tag: release-0.7.7
    additional_opts: " DB"
verify_model_training_script:
  # Runs model_training/script.sh end to end on a hosted runner:
  # prepare_cluster first, then cleanup.
  runs-on: ubuntu-latest
  steps:
    # The action reference had been mangled into an e-mail-protection
    # placeholder, which is not a valid `uses:` value. Restored to the
    # checkout major version the other jobs in this workflow use.
    - uses: actions/checkout@v4
    - name: local script test
      run: cd model_training && ./script.sh prepare_cluster
    - name: clean up
      run: cd model_training && ./script.sh cleanup