From fcd8d87fd7cba70e07146a31d86b789251e7a7b9 Mon Sep 17 00:00:00 2001 From: Diamon Wiggins Date: Tue, 8 Apr 2025 14:41:56 -0400 Subject: [PATCH 01/18] add release pipeline for mlflow fix app slug fix helm setup action fix helm version add ct.yaml bump upload-artifact version fix kubeconfig retrieval bump chart testing version fix typo ci improvements and adding kots install remove conditional from create release pass license file as content pass license as content modify app slug for unstable channel fix slug move some steps to make targets add replicated helm install to pipeline fix chart testing setup fix makefile debug ci debug ci fix ci fix ci use jq to parse customer json debug ci try getting oci working with chart testing fix ct install kubeconfig offset channel name by run number in case of re-runs add more distros revert adding distros cleanup add test for mlflow fix mlflow test domain fix chart file path fix helm install stop using chart-testing fix helm install enable application tests debug remove make make sure helm dependencies get updated fix helm install bump timeout on test script and increase cluster size use kubectl port forward to access app fix port forward wait before port-forward fix authentication in test script fix test add application testing for kots install improve wait for mlfow service before starting test moving to taskfile moving to taskfile commit taskfile fix taskfile remove check deps fix get licenseid fix get licenseid fix get licenseid fix get licenseid fix get licenseid fix port forward fix port forward add missing dependency for tests refactor taskfile polish readme --- .github/workflows/mlflow-ci.yml | 571 +++++++++++++ applications/mlflow/Makefile | 57 -- applications/mlflow/README.md | 171 +++- applications/mlflow/Taskfile.yml | 804 ++++++++++++++++++ applications/mlflow/charts/mlflow/Chart.lock | 6 +- applications/mlflow/charts/mlflow/values.yaml | 50 +- .../tests/helm/nodeport-ingress-disabled.yaml | 17 + applications/mlflow/tests/mlflow_test.py | 267 ++++++ 8 files changed, 1852 insertions(+), 91 deletions(-) create mode 100644 .github/workflows/mlflow-ci.yml delete mode 100644 applications/mlflow/Makefile create mode 100644 applications/mlflow/Taskfile.yml create mode 100644 applications/mlflow/tests/helm/nodeport-ingress-disabled.yaml create mode 100644 applications/mlflow/tests/mlflow_test.py diff --git a/.github/workflows/mlflow-ci.yml b/.github/workflows/mlflow-ci.yml new file mode 100644 index 00000000..78d37cb4 --- /dev/null +++ b/.github/workflows/mlflow-ci.yml @@ -0,0 +1,571 @@ +name: MLflow CI + +on: + pull_request: + paths: + - 'applications/mlflow/charts/**' + - 'applications/mlflow/kots/**' + - 'applications/mlflow/tests/**' + - 'applications/mlflow/Taskfile.yml' + - '.github/workflows/mlflow-ci.yml' + push: + branches: + - main + paths: + - 'applications/mlflow/charts/**' + - 'applications/mlflow/kots/**' + - 'applications/mlflow/tests/**' + - 'applications/mlflow/Taskfile.yml' + - '.github/workflows/mlflow-ci.yml' + +env: + APP_SLUG: diamon-mlflow + +jobs: + lint-and-template: + runs-on: ubuntu-22.04 + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Helm + uses: azure/setup-helm@v4.3.0 + with: + version: v3.13.3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: 3.12 + + - name: Install Task + uses: arduino/setup-task@v1 + with: + version: 3.x + repo-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Install yq + run: | + wget 
https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /usr/local/bin/yq + chmod +x /usr/local/bin/yq + + - name: Run Lint and Template + working-directory: applications/mlflow + run: | + # Use Taskfile to add Helm repos, lint charts and generate templates + task add:repos:helm + task update:deps:helm + task lint + task template + + - name: Check Version Consistency + working-directory: applications/mlflow + run: | + # Ensure Chart.yaml and HelmChart versions are in sync + task check:versions + + create-release: + runs-on: ubuntu-22.04 + needs: [lint-and-template] + outputs: + customer-id: ${{ steps.create-customer.outputs.customer-id }} + channel-slug: ${{ steps.create-release.outputs.channel-slug }} + chart-version: ${{ steps.chart-version.outputs.chart_version }} + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Helm + uses: azure/setup-helm@v4.3.0 + with: + version: v3.13.3 + + - name: Install Task + uses: arduino/setup-task@v1 + with: + version: 3.x + repo-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Install yq + run: | + wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /usr/local/bin/yq + chmod +x /usr/local/bin/yq + + - name: Package and Update Versions + working-directory: applications/mlflow + run: | + # Update and package charts + task add:repos:helm + task update:deps:helm + task update:versions:chart + task package:charts + + # Extract MLflow chart version for reference + - name: Extract MLflow chart version + id: chart-version + working-directory: applications/mlflow + run: | + # Extract MLflow chart version using taskfile variable + CHART_VERSION=$(task -s extract:version:chart) + echo "chart_version=$CHART_VERSION" >> $GITHUB_OUTPUT + echo "Using MLflow chart version: $CHART_VERSION" + + - name: Create release + id: create-release + uses: replicatedhq/replicated-actions/create-release@v1.17.0 + with: + app-slug: ${{ env.APP_SLUG }} + api-token: ${{ secrets.REPLICATED_PLATFORM_EXAMPLES_TOKEN }} + yaml-dir: applications/mlflow/kots/ + promote-channel: ci-automation-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }} + version: ${{ steps.chart-version.outputs.chart_version }} + + - name: Create customer + id: create-customer + uses: replicatedhq/replicated-actions/create-customer@main + with: + app-slug: ${{ env.APP_SLUG }} + api-token: ${{ secrets.REPLICATED_PLATFORM_EXAMPLES_TOKEN }} + customer-name: automated-${{ github.run_id }} + customer-email: testcustomer@replicated.com + license-type: dev + channel-slug: ${{ steps.create-release.outputs.channel-slug }} + is-kots-install-enabled: "true" + + helm-install-test: + runs-on: ubuntu-22.04 + needs: [create-release] + strategy: + fail-fast: false + matrix: + cluster: + - distribution: kind + version: 1.32 + config: + - name: nodeport-ingress-disabled + values_file: tests/helm/nodeport-ingress-disabled.yaml + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Helm + uses: azure/setup-helm@v4.3.0 + with: + version: v3.13.3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: 3.12 + + - name: Install Task + uses: arduino/setup-task@v1 + with: + version: 3.x + repo-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Install yq + run: | + wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /usr/local/bin/yq + chmod +x /usr/local/bin/yq + + # Install jq via apt-get + - name: Install jq + run: | + sudo apt-get 
update + sudo apt-get install -y jq + + # Get license ID from customer inspect + - name: Get License ID + id: get-license + working-directory: applications/mlflow + run: | + # Create directory for license + mkdir -p /tmp/replicated + + # Get customer name from previous step + CUSTOMER_NAME="${{ needs.create-release.outputs.customer-id }}" + echo "Using customer name: $CUSTOMER_NAME" + + # Get license ID using the task - capture only the last line of output + echo "Getting license ID..." + INSTALLATION_ID=$(CUSTOMER_NAME="$CUSTOMER_NAME" task get:license-id:customer | tail -n 1) + + # Check if we got a result + if [ -z "$INSTALLATION_ID" ]; then + echo "ERROR: Got empty license ID" + exit 1 + fi + + echo "License ID retrieved successfully" + echo "license_id=$INSTALLATION_ID" >> $GITHUB_OUTPUT + env: + REPLICATED_API_TOKEN: ${{ secrets.REPLICATED_PLATFORM_EXAMPLES_TOKEN }} + + - name: Create Cluster + id: create-cluster + uses: replicatedhq/replicated-actions/create-cluster@v1.17.0 + with: + api-token: ${{ secrets.REPLICATED_PLATFORM_EXAMPLES_TOKEN }} + kubernetes-distribution: ${{ matrix.cluster.distribution }} + kubernetes-version: ${{ matrix.cluster.version }} + cluster-name: mlflow-ci-${{ github.run_id }}-${{ matrix.cluster.distribution }}-${{ matrix.cluster.version }}-${{ matrix.config.name }} + disk: 100 + instance-type: r1.large + ttl: 1h + export-kubeconfig: true + + - name: Setup Namespace + working-directory: applications/mlflow + run: | + # Save kubeconfig to a file + KUBECONFIG_FILE="/tmp/kubeconfig-helm-test-${{ github.run_id }}" + echo "$KUBECONFIG" > "$KUBECONFIG_FILE" + + # Create namespace using taskfile + KUBECONFIG="$KUBECONFIG_FILE" task setup:namespaces + env: + KUBECONFIG: ${{ steps.create-cluster.outputs.cluster-kubeconfig }} + + - name: Run Helm Test + working-directory: applications/mlflow + run: | + # Save kubeconfig to a file + KUBECONFIG_FILE="/tmp/kubeconfig-helm-test-${{ github.run_id }}" + echo "$KUBECONFIG" > "$KUBECONFIG_FILE" + + # Run task to test Helm installation + KUBECONFIG="$KUBECONFIG_FILE" MLFLOW_VALUES="${{ matrix.config.values_file }}" task test:install:helm + env: + KUBECONFIG: ${{ steps.create-cluster.outputs.cluster-kubeconfig }} + REPLICATED_APP: ${{ env.APP_SLUG }} + REPLICATED_CHANNEL: ${{ needs.create-release.outputs.channel-slug }} + REPLICATED_LICENSE_ID: ${{ steps.get-license.outputs.license_id }} + TIMEOUT: 5m + WAIT_RETRIES: 30 + RETRY_INTERVAL: 10 + + - name: Run Application Tests + working-directory: applications/mlflow + run: | + # Run task to test application + task run:tests:app + env: + KUBECONFIG: ${{ steps.create-cluster.outputs.cluster-kubeconfig }} + PORT: 5000 + + - name: Install troubleshoot + run: curl -L https://github.com/replicatedhq/troubleshoot/releases/latest/download/support-bundle_linux_amd64.tar.gz | tar xzvf - + if: failure() + + - name: Collect bundle + run: | + # Save kubeconfig to a file + KUBECONFIG_FILE="/tmp/kubeconfig-helm-bundle-${{ github.run_id }}" + echo "$KUBECONFIG" > "$KUBECONFIG_FILE" + echo "Saved kubeconfig to $KUBECONFIG_FILE" + + ./support-bundle --kubeconfig="$KUBECONFIG_FILE" --interactive=false -o ci-bundle-${{ matrix.cluster.distribution }}-${{ matrix.cluster.version }}-${{ matrix.config.name }} https://raw.githubusercontent.com/replicatedhq/troubleshoot-specs/main/in-cluster/default.yaml + if: failure() + env: + KUBECONFIG: ${{ steps.create-cluster.outputs.cluster-kubeconfig }} + + - name: Upload support bundle artifact + uses: actions/upload-artifact@v4 + if: failure() + with: + name: 
mlflow-bundle-${{ matrix.cluster.distribution }}-${{ matrix.cluster.version }}-${{ matrix.config.name }} + path: 'ci-bundle-${{ matrix.cluster.distribution }}-${{ matrix.cluster.version }}-${{ matrix.config.name }}.tar.gz' + + - name: Remove Cluster + uses: replicatedhq/replicated-actions/remove-cluster@v1.17.0 + if: ${{ always() && steps.create-cluster.outputs.cluster-id != '' }} + with: + api-token: ${{ secrets.REPLICATED_PLATFORM_EXAMPLES_TOKEN }} + cluster-id: ${{ steps.create-cluster.outputs.cluster-id }} + + kots-install-test: + runs-on: ubuntu-22.04 + needs: [create-release] + strategy: + fail-fast: false + matrix: + cluster: + - distribution: kind + version: 1.32 + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: 3.12 + + - name: Install Task + uses: arduino/setup-task@v1 + with: + version: 3.x + repo-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Install yq + run: | + wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /usr/local/bin/yq + chmod +x /usr/local/bin/yq + + - name: Create Cluster + id: create-cluster + uses: replicatedhq/replicated-actions/create-cluster@v1.17.0 + with: + api-token: ${{ secrets.REPLICATED_PLATFORM_EXAMPLES_TOKEN }} + kubernetes-distribution: ${{ matrix.cluster.distribution }} + kubernetes-version: ${{ matrix.cluster.version }} + cluster-name: mlflow-kots-${{ github.run_id }}-${{ matrix.cluster.distribution }}-${{ matrix.cluster.version }} + disk: 100 + instance-type: r1.large + ttl: 1h + export-kubeconfig: true + + # Download license using task + - name: Download license + id: download-license + working-directory: applications/mlflow + run: | + # Create a temporary file to store the license + mkdir -p /tmp/replicated + + # Set customer name for download + CUSTOMER_NAME="${{ needs.create-release.outputs.customer-id }}" + echo "Using customer name: $CUSTOMER_NAME for license download" + + # Try to download license + echo "Attempting to download license..." + set +e + CUSTOMER_NAME="$CUSTOMER_NAME" task download:license:customer + DOWNLOAD_RESULT=$? + set -e + + if [ $DOWNLOAD_RESULT -ne 0 ]; then + echo "Failed to download license. Error code: $DOWNLOAD_RESULT" + # Diagnostic steps + echo "Checking if license file exists..." + ls -la /tmp/replicated || true + exit 1 + fi + + # Check that license file exists and has content + if [ ! -f "/tmp/replicated/license.yaml" ] || [ ! -s "/tmp/replicated/license.yaml" ]; then + echo "License file is missing or empty!" + exit 1 + fi + + echo "License file downloaded successfully to /tmp/replicated/license.yaml" + + # Read the license and set it as an output + LICENSE_CONTENT=$(cat /tmp/replicated/license.yaml) + + # Use EOF delimiter for multi-line output + echo "license<> $GITHUB_OUTPUT + echo "$LICENSE_CONTENT" >> $GITHUB_OUTPUT + echo "EOF" >> $GITHUB_OUTPUT + env: + REPLICATED_API_TOKEN: ${{ secrets.REPLICATED_PLATFORM_EXAMPLES_TOKEN }} + + # Verify license file is valid + - name: Verify License File + run: | + # Create a temporary file to check the license + echo "Saving license file for validation..." + LICENSE_FILE="/tmp/replicated/license-ci.yaml" + mkdir -p /tmp/replicated + + # Save the license content to a file + cat << 'EOF' > $LICENSE_FILE + ${{ steps.download-license.outputs.license }} + EOF + + echo "License file content (first 10 lines):" + head -n 10 $LICENSE_FILE + + # Check if the license file is valid YAML + echo "Validating license file..." + yq eval . 
$LICENSE_FILE > /dev/null + if [ $? -ne 0 ]; then + echo "ERROR: License file is not valid YAML" + exit 1 + else + echo "✅ License file is valid YAML" + fi + + # Install using KOTS + - name: KOTS Install + uses: replicatedhq/replicated-actions/kots-install@v1.17.0 + with: + kubeconfig: ${{ steps.create-cluster.outputs.cluster-kubeconfig }} + kots-version: latest + app-slug: ${{ env.APP_SLUG }}/${{ needs.create-release.outputs.channel-slug }} + app-version-label: ${{ needs.create-release.outputs.chart-version }} + license-file: ${{ steps.download-license.outputs.license }} + namespace: default + wait-duration: 10m + shared-password: 'replicatedmlflow' + skip-preflights: true + debug: true + + # Set up port forwarding after KOTS installation is complete + - name: Set up port forwarding + id: port-forward + run: | + # Use kubeconfig file + KUBECONFIG_FILE="/tmp/kubeconfig-kots-test-${{ github.run_id }}" + echo "$KUBECONFIG" > "$KUBECONFIG_FILE" + echo "Saved kubeconfig to $KUBECONFIG_FILE" + + # Hardcoded port 5000 for simplicity + PORT="5000" + echo "Using port: $PORT for testing" + + # Wait for the MLflow service to be created + echo "Waiting for MLflow service to be created..." + MAX_RETRIES=30 + RETRY_INTERVAL=10 + RETRY_COUNT=0 + SERVICE_FOUND=false + + while [ $RETRY_COUNT -lt $MAX_RETRIES ]; do + echo "Check $((RETRY_COUNT+1))/$MAX_RETRIES: Looking for MLflow service..." + if KUBECONFIG="$KUBECONFIG_FILE" kubectl get svc mlflow -n default --no-headers 2>/dev/null; then + echo "✅ MLflow service found!" + SERVICE_FOUND=true + break + else + echo "MLflow service not found yet. Waiting $RETRY_INTERVAL seconds..." + RETRY_COUNT=$((RETRY_COUNT+1)) + sleep $RETRY_INTERVAL + fi + done + + if [ "$SERVICE_FOUND" != "true" ]; then + echo "❌ ERROR: MLflow service not found after $((MAX_RETRIES * RETRY_INTERVAL)) seconds." + echo "Showing all available services in the namespace:" + KUBECONFIG="$KUBECONFIG_FILE" kubectl get svc -n default + echo "Showing KOTS application status:" + KUBECONFIG="$KUBECONFIG_FILE" kubectl get app -n default + echo "Showing all pods in the namespace:" + KUBECONFIG="$KUBECONFIG_FILE" kubectl get pods -n default + exit 1 + fi + + # Verify services are present + echo "Verifying MLflow service exists..." + KUBECONFIG="$KUBECONFIG_FILE" kubectl get svc -n default + + # Check pod status and wait for them to be running + echo "Checking pod status..." + KUBECONFIG="$KUBECONFIG_FILE" kubectl get pods -n default + + echo "Waiting for MLflow pods to be running..." + KUBECONFIG="$KUBECONFIG_FILE" kubectl wait --for=condition=Ready pods --selector=app.kubernetes.io/name=mlflow -n default --timeout=2m || { + echo "WARNING: Timed out waiting for pods to be ready, will try port-forwarding anyway" + KUBECONFIG="$KUBECONFIG_FILE" kubectl describe pods -n default + } + + # Set up port forwarding in the background + echo "Setting up port forwarding to run in the background" + nohup bash -c "KUBECONFIG='$KUBECONFIG_FILE' kubectl port-forward -n default svc/mlflow $PORT:5000 &>/tmp/port-forward-kots-${{ github.run_id }}.log" & + PORT_FORWARD_PID=$! + echo "port_forward_pid=$PORT_FORWARD_PID" >> $GITHUB_OUTPUT + echo "Set up port forwarding with PID: $PORT_FORWARD_PID" + + # Set hostname for testing + echo "hostname=localhost:$PORT" >> $GITHUB_OUTPUT + echo "Test endpoint will be: localhost:$PORT" + + # Give port-forward more time to establish + echo "Waiting for port-forward to establish..." + sleep 15 + + # Basic connectivity check + echo "Checking connectivity to MLflow..." 
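+          # Note: the condition below only checks curl's exit status (i.e. the endpoint was reachable);
+          # the HTTP status code printed by -w is informational and is not what gates success here.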
+ if curl -s -o /dev/null -w "%{http_code}" http://localhost:$PORT/; then + echo "Successfully connected to MLflow service!" + else + echo "Warning: Initial connection attempt failed, service may still be starting" + # Show the port-forward log for debugging + echo "Port-forward log:" + cat /tmp/port-forward-kots-${{ github.run_id }}.log || true + + # If port-forward failed, check pod logs + echo "Pod logs:" + KUBECONFIG="$KUBECONFIG_FILE" kubectl logs -n default -l app.kubernetes.io/name=mlflow --tail=20 || true + fi + env: + KUBECONFIG: ${{ steps.create-cluster.outputs.cluster-kubeconfig }} + + # Application testing with our consolidated test file + - name: Run Application Tests + working-directory: applications/mlflow + run: | + # Run task to test application + task run:tests:app + env: + KUBECONFIG: ${{ steps.create-cluster.outputs.cluster-kubeconfig }} + PORT: 5000 + + - name: Install troubleshoot + run: curl -L https://github.com/replicatedhq/troubleshoot/releases/latest/download/support-bundle_linux_amd64.tar.gz | tar xzvf - + if: failure() + + - name: Collect bundle + run: | + # Save kubeconfig to a file + KUBECONFIG_FILE="/tmp/kubeconfig-kots-bundle-${{ github.run_id }}" + echo "$KUBECONFIG" > "$KUBECONFIG_FILE" + echo "Saved kubeconfig to $KUBECONFIG_FILE" + + ./support-bundle --kubeconfig="$KUBECONFIG_FILE" --interactive=false -o kots-ci-bundle-${{ matrix.cluster.distribution }}-${{ matrix.cluster.version }} https://raw.githubusercontent.com/replicatedhq/troubleshoot-specs/main/in-cluster/default.yaml + if: failure() + env: + KUBECONFIG: ${{ steps.create-cluster.outputs.cluster-kubeconfig }} + + - name: Upload support bundle artifact + uses: actions/upload-artifact@v4 + if: failure() + with: + name: mlflow-kots-bundle-${{ matrix.cluster.distribution }}-${{ matrix.cluster.version }} + path: 'kots-ci-bundle-${{ matrix.cluster.distribution }}-${{ matrix.cluster.version }}.tar.gz' + + - name: Remove Cluster + uses: replicatedhq/replicated-actions/remove-cluster@v1.17.0 + if: ${{ always() && steps.create-cluster.outputs.cluster-id != '' }} + with: + api-token: ${{ secrets.REPLICATED_PLATFORM_EXAMPLES_TOKEN }} + cluster-id: ${{ steps.create-cluster.outputs.cluster-id }} + + cleanup-test-release: + runs-on: ubuntu-22.04 + needs: [create-release, kots-install-test, helm-install-test] + if: always() + steps: + - name: Archive Customer + if: ${{ always() && needs.create-release.outputs.customer-id != '' }} + uses: replicatedhq/replicated-actions/archive-customer@v1.17.0 + with: + api-token: ${{ secrets.REPLICATED_PLATFORM_EXAMPLES_TOKEN }} + customer-id: ${{ needs.create-release.outputs.customer-id }} + + - name: Archive Channel + if: ${{ always() && needs.create-release.outputs.channel-slug != '' }} + uses: replicatedhq/replicated-actions/archive-channel@v1.17.0 + with: + app-slug: ${{ env.APP_SLUG }} + api-token: ${{ secrets.REPLICATED_PLATFORM_EXAMPLES_TOKEN }} + channel-slug: ${{ needs.create-release.outputs.channel-slug }} diff --git a/applications/mlflow/Makefile b/applications/mlflow/Makefile deleted file mode 100644 index 404c02b7..00000000 --- a/applications/mlflow/Makefile +++ /dev/null @@ -1,57 +0,0 @@ -manifests_dir := $(shell pwd)/kots -chart_archives := $(wildcard $(manifests_dir)/*.tgz) - -ARGS = $(filter-out $@,$(MAKECMDGOALS)) -%: - @: - -SHELL := /bin/bash -.SHELLFLAGS = -x +u - -# Define the base path to your Helm charts directory -HELM_CHARTS_DIR = ./charts - -# Define the path to your target KOTS directory -KOTS_DIR = ./kots - -# Define the function to extract 
chartVersion -define get_kots_chart_version - grep 'chartVersion:' $(1) | sed 's/.*chartVersion: //' -endef - -# Function to get chart version -define get_helm_chart_version - helm show chart $(1) | grep '^version:' | cut -d ' ' -f 2 -endef - -# Target to package charts and update versions -.PHONY: package-and-update -package-and-update: add-helm-repositories - @for chart in $(HELM_CHARTS_DIR)/*; do \ - echo "Packaging $$chart"; \ - helm package $$chart -u -d $(KOTS_DIR); \ - version=$$(eval $(call get_helm_chart_version,$$chart)); \ - chart_name=$$(basename $$chart); \ - echo "Updating version to $$version in $(KOTS_DIR)/*-chart.yaml"; \ - sed -i '' 's|chartVersion: [0-9a-zA-Z.-]*|chartVersion: '$$version'|g' $(KOTS_DIR)/$$chart_name-chart.yaml; \ - done - -# Target to add Helm repositories -.PHONY: add-helm-repositories -add-helm-repositories: - @helm repo add cnpg https://cloudnative-pg.github.io/charts - @helm repo add minio https://operator.min.io/ - -# Example target to check versions (optional) -.PHONY: check-versions -check-versions: - @for chart_dir in $(HELM_CHARTS_DIR)/*; do \ - if [ -d $$chart_dir ]; then \ - version=$$(eval $(call get_helm_chart_version,$$chart_dir)); \ - echo "$$chart_dir version: $$version"; \ - fi \ - done - -.PHONY: clean-charts -clean-charts: - rm -f $(KOTS_DIR)/*.tgz diff --git a/applications/mlflow/README.md b/applications/mlflow/README.md index 1b86bcc7..4863b9b7 100644 --- a/applications/mlflow/README.md +++ b/applications/mlflow/README.md @@ -1,11 +1,170 @@ -# Mlflow +# MLflow -## Get Started +MLflow is an open-source platform for managing the machine learning lifecycle, including experimentation, reproducibility, deployment, and a central model registry. This application provides a Helm-based deployment of MLflow with support for Replicated distribution. -## Helm installs +## Development -## KOTS Existing Cluster +The MLflow application includes a Taskfile.yml that provides tasks for developing, testing, and publishing the application. -## Embedded Cluster +### Prerequisites -## Running the Mlflow Quickstart Example +- [Task](https://taskfile.dev/#/installation) command line tool +- Kubernetes cluster configured in your current context +- kubectl, helm, and python3 installed + +### Development Workflow + +Follow this workflow for development: + +1. Add required Helm repositories and update dependencies: + ```bash + task add:repos:helm + task update:deps:helm + ``` + +2. Lint charts to check for issues: + ```bash + task lint + ``` + +3. Template charts to verify the rendered manifests: + ```bash + task template + ``` + +4. Install charts for development: + ```bash + # Installs with Replicated SDK disabled + task install:helm:local + + # Optionally specify a custom values file + MLFLOW_VALUES=./my-values.yaml task install:helm:local + ``` + + > **Note:** For local development, the Replicated SDK is explicitly disabled (`replicated.enabled=false`). This allows development without requiring access to the Replicated platform. + > + > This task automatically sets up port forwarding from localhost:5000 to the MLflow service in the cluster, making the application available for testing. + +5. Run application tests: + ```bash + task run:tests:app + ``` + +6. Make changes to your charts and repeat steps 2-5 as needed + +This workflow allows rapid iteration without needing to publish to the Replicated registry. 
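+
+As a quick sanity check while iterating, you can log a throwaway run against the port-forwarded tracking server. The snippet below is only a minimal sketch: it assumes the default port-forward on `localhost:5000` set up by `install:helm:local`, the experiment name and values are arbitrary, and if basic authentication is enabled in your values you will also need to export `MLFLOW_TRACKING_USERNAME` and `MLFLOW_TRACKING_PASSWORD` first.
+
+```python
+import mlflow
+
+# Assumes the port-forward from `task install:helm:local` is active on localhost:5000
+mlflow.set_tracking_uri("http://localhost:5000")
+mlflow.set_experiment("local-dev-smoke-test")
+
+# Log a throwaway run to confirm the tracking server accepts writes
+with mlflow.start_run(run_name="connectivity-check"):
+    mlflow.log_param("source", "install:helm:local")
+    mlflow.log_metric("dummy_metric", 1.0)
+```
+
+For the full end-to-end checks, `task run:tests:app` drives `tests/mlflow_test.py` against the same endpoint.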
+ +### Task Reference + +Tasks follow a `verb:resource[:subresource]` naming convention for clarity: + +```bash +# Validation and verification +task lint # Lint Helm charts +task template # Render templates to stdout (SDK disabled) +task check:versions # Verify Chart.yaml and KOTS manifest versions match + +# Repository and dependency management +task add:repos:helm # Add required Helm repositories +task update:deps:helm # Update Helm chart dependencies + +# Packaging and versioning +task update:versions:chart # Update chart version refs in KOTS manifests +task package:charts # Package Helm charts for distribution +task extract:version:chart # Extract current MLflow chart version + +# Installation +task install:helm:local # Install charts for local development (SDK disabled) + +# Testing +task test:install:helm # Test with charts from Replicated registry +task test:install:kots # Test KOTS installation +task run:tests:app # Run application tests against running MLflow +task run:tests:all # Run all tests (Helm install + app tests) + +# Release management +task create:release # Create a Replicated release + +# Cleanup +task clean:files:charts # Clean packaged chart files +task clean:all # Clean all generated files +``` + +### Publishing Replicated Releases + +When you're ready to publish your changes to the Replicated platform: + +1. Set up the required environment variables: + ```bash + # Replicated API token for authentication + export REPLICATED_API_TOKEN=your_api_token + + # App and channel to publish to + export REPLICATED_APP=app_slug + export REPLICATED_CHANNEL=channel_name + ``` + +2. Package the charts and update version references: + ```bash + # This updates KOTS manifests with the current chart versions + # and packages the charts as .tgz files + task package:charts + ``` + +3. Create a release in Replicated: + ```bash + # This uploads the packaged charts and creates a new release + task create:release + ``` + +4. Verify the release was created successfully in the Replicated vendor portal + +### Testing Replicated Releases + +This workflow tests the full Replicated release and distribution process: + +1. After publishing a release, login to the registry with a license ID: + ```bash + # Set license ID for registry authentication + export REPLICATED_LICENSE_ID=your_license_id + export REPLICATED_APP=app_slug + export REPLICATED_CHANNEL=channel_name + + # Login to the registry + task login:registry + ``` + +2. Test the Helm installation from the Replicated registry: + ```bash + # This pulls charts from the Replicated registry with SDK enabled + task test:install:helm + ``` + +3. Verify the installation with application tests: + ```bash + task run:tests:app + ``` + +This workflow validates the entire release pipeline from publishing to installation, ensuring that your charts work correctly when distributed through the Replicated platform. + +## CI/CD Pipeline + +This application includes a CI/CD pipeline implemented with GitHub Actions. The pipeline handles: + +- Linting and validating Helm chart templates +- Creating releases in Replicated +- Testing Helm installation with charts from the Replicated registry +- Installing the application via KOTS + +The pipeline workflow: +1. `lint-and-template`: Validates chart syntax and templates (SDK disabled) +2. `create-release`: Packages charts and creates a release in Replicated +3. `helm-install-test`: Tests Helm installation with charts from Replicated registry (SDK enabled) +4. `kots-install-test`: Tests KOTS installation +5. 
`cleanup-test-release`: Cleans up test resources + +The pipeline is triggered on: +- Pull requests affecting the MLflow application +- Pushes to the main branch + +For more details, see the workflow definition in [.github/workflows/mlflow-ci.yml](../../.github/workflows/mlflow-ci.yml). diff --git a/applications/mlflow/Taskfile.yml b/applications/mlflow/Taskfile.yml new file mode 100644 index 00000000..2a6c4d21 --- /dev/null +++ b/applications/mlflow/Taskfile.yml @@ -0,0 +1,804 @@ +version: '3' + +# MLflow CI and Local Testing Taskfile +# This file centralizes all testing and CI tasks for the MLflow application + +# Common variables +vars: + # Directory structure + CHART_DIR: ./charts + KOTS_DIR: ./kots + TESTS_DIR: ./tests + + # Testing configuration + NAMESPACE: mlflow + PORT: 5000 + + # Chart configuration + CHARTS: mlflow infra + + # Environment detection + CI: + sh: echo "${CI:-false}" + + # Resource-related parameters (adjustable for local/CI environments) + TIMEOUT: '{{if eq .CI "true"}}5m{{else}}3m{{end}}' + WAIT_RETRIES: '{{if eq .CI "true"}}30{{else}}15{{end}}' + RETRY_INTERVAL: '10' + + # Helm chart versions (dynamically determined) + MLFLOW_VERSION: + sh: helm show chart ./charts/mlflow | grep '^version:' | cut -d ' ' -f 2 + INFRA_VERSION: + sh: helm show chart ./charts/infra | grep '^version:' | cut -d ' ' -f 2 + + # Release configuration + APP_NAME: diamon-mlflow + YAML_DIR: "./kots" + +# Default task shows help +tasks: + default: + desc: Show help information about available tasks + cmds: + - echo "MLflow CI and Testing Tasks" + - echo "==========================" + - task --list + silent: true + + # Version checking task + check:versions: + desc: Check if versions match between Chart.yaml files and HelmChart manifests + cmds: + - echo "Checking chart versions consistency..." + - | + # For each available chart, check that versions match + all_match=true + + for chart in {{.CHARTS}}; do + echo "Checking $chart chart..." + + # Get the Chart.yaml path + chart_yaml="{{.CHART_DIR}}/$chart/Chart.yaml" + + # Get the HelmChart resource path + helmchart="{{.KOTS_DIR}}/$chart-chart.yaml" + + # Check if both files exist + if [ ! -f "$chart_yaml" ]; then + echo "❌ Chart.yaml not found: $chart_yaml" + all_match=false + continue + fi + + if [ ! -f "$helmchart" ]; then + echo "❌ HelmChart resource not found: $helmchart" + all_match=false + continue + fi + + # Get versions from both files + chart_yaml_version=$(yq e '.version' "$chart_yaml") + helmchart_version=$(yq e '.spec.chart.chartVersion' "$helmchart") + + # Check if versions could be extracted + if [ -z "$chart_yaml_version" ]; then + echo "❌ Could not extract version from Chart.yaml: $chart_yaml" + all_match=false + continue + fi + + if [ -z "$helmchart_version" ]; then + echo "❌ Could not extract chartVersion from HelmChart: $helmchart" + all_match=false + continue + fi + + # Compare versions + if [ "$chart_yaml_version" = "$helmchart_version" ]; then + echo "✅ Versions match for $chart: $chart_yaml_version" + else + echo "❌ Version mismatch for $chart:" + echo " Chart.yaml version: $chart_yaml_version" + echo " HelmChart version: $helmchart_version" + all_match=false + fi + done + + # Exit with error if any versions don't match + if [ "$all_match" = true ]; then + echo "✅ All chart versions match between Chart.yaml and HelmChart resources." + else + echo "❌ Version mismatches found! Please run 'task update:versions' to synchronize them." 
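+          # (the version sync task is defined below in this Taskfile as update:versions:chart)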
+ exit 1 + fi + + # Repository setup - renamed + add:repos:helm: + desc: Add required Helm repositories + cmds: + - helm repo add cnpg https://cloudnative-pg.github.io/charts + - helm repo add minio https://operator.min.io/ + - helm repo update + + # Dependency update - renamed + update:deps:helm: + desc: Update Helm chart dependencies + deps: [add:repos:helm] + cmds: + - echo "Updating Helm chart dependencies..." + - for: { var: CHARTS } + cmd: | + echo "Updating dependencies for {{.ITEM}} chart..." + helm dependency update {{.CHART_DIR}}/{{.ITEM}} + - echo "Helm chart dependencies updated successfully." + + # Chart linting + lint: + desc: Lint Helm charts + deps: [add:repos:helm, update:deps:helm] + cmds: + - echo "Linting Helm charts..." + - for: { var: CHARTS } + cmd: | + echo "Linting {{.ITEM}} chart..." + helm lint {{.CHART_DIR}}/{{.ITEM}} + - echo "Linting completed successfully." + + # Template rendering + template: + desc: Template Helm charts with Replicated SDK disabled and output to stdout + deps: [add:repos:helm, update:deps:helm] + cmds: + - echo "Templating Helm charts with Replicated SDK disabled..." + - for: { var: CHARTS } + cmd: | + echo "=== Rendering templates for {{.ITEM}} chart ===" + echo "===============================================" + helm template {{.CHART_DIR}}/{{.ITEM}} --debug + echo "" + echo "=== End of templates for {{.ITEM}} chart ===" + echo "" + - echo "All chart templates have been output to stdout." + + # Version update for packaged charts + update:versions:chart: + desc: Update chart version references in KOTS manifests + cmds: + - for: { var: CHARTS } + cmd: | + sed -i 's|chartVersion: [0-9a-zA-Z.-]*|chartVersion: {{if eq .ITEM "mlflow"}}{{.MLFLOW_VERSION}}{{else}}{{.INFRA_VERSION}}{{end}}|g' {{.KOTS_DIR}}/{{.ITEM}}-chart.yaml + - echo "Chart versions updated in KOTS manifests." + - cmd: task check:versions || echo "⚠️ Version check failed after update. Please verify manually." + + # Packaging tasks + package:charts: + desc: Package Helm charts for distribution + deps: [add:repos:helm, update:deps:helm, update:versions:chart] + cmds: + - echo "Packaging Helm charts..." + - for: { var: CHARTS } + cmd: | + echo "Packaging {{.ITEM}} chart..." + helm package {{.CHART_DIR}}/{{.ITEM}} -u -d {{.KOTS_DIR}} + - echo "Charts packaged successfully in {{.KOTS_DIR}} directory." + + # Release creation + create:release: + desc: Create a release in Replicated + deps: [check:versions, package:charts] + vars: + VERSION: '{{.VERSION | default .MLFLOW_VERSION}}' + REPLICATED_CHANNEL: '{{.REPLICATED_CHANNEL | default ""}}' + cmds: + - echo "Creating release version {{.VERSION}} for app {{.APP_NAME}}..." + - | + if [ -z "{{.REPLICATED_CHANNEL}}" ]; then + echo "❌ Error: No channel specified. Please provide a channel with REPLICATED_CHANNEL=your-channel-name" + exit 1 + fi + + echo "Creating release for app {{.APP_NAME}} in channel {{.REPLICATED_CHANNEL}} with version {{.VERSION}}" + replicated release create \ + --yaml-dir {{.YAML_DIR}} \ + --promote {{.REPLICATED_CHANNEL}} \ + --ensure-channel \ + --version {{.VERSION}} \ + --release-notes "{{.RELEASE_NOTES}}" \ + --app {{.APP_NAME}} + + if [ $? -eq 0 ]; then + echo "✅ Release {{.VERSION}} created successfully and promoted to channel {{.REPLICATED_CHANNEL}}" + else + echo "❌ Failed to create release" + exit 1 + fi + + # Namespace setup + setup:namespaces: + desc: Create and setup required namespaces + cmds: + - echo "Setting up required namespaces..." 
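+      # Idempotent create: render the namespace manifest client-side and apply it, so re-runs don't fail if it already exists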
+ - kubectl create namespace {{.NAMESPACE}} --dry-run=client -o yaml | kubectl apply -f - + - echo "Namespace setup complete" + + # Registry authentication/setup + login:registry: + desc: Login to Replicated registry (requires REPLICATED_LICENSE_ID env var) + cmds: + - echo "Authenticating with Replicated registry..." + - | + if [ -z "$REPLICATED_LICENSE_ID" ]; then + echo "ERROR: REPLICATED_LICENSE_ID environment variable must be set" + exit 1 + fi + helm registry login registry.replicated.com \ + --username="$REPLICATED_LICENSE_ID" \ + --password="$REPLICATED_LICENSE_ID" + - echo "Registry login successful." + + # Customer license ID retrieval + get:license-id:customer: + desc: Extract license ID from a customer (requires REPLICATED_API_TOKEN and customer name) + cmds: + - echo "Extracting license ID for customer {{.CUSTOMER_NAME}}..." + - | + # Validate required environment variables + if [ -z "$REPLICATED_API_TOKEN" ]; then + echo "ERROR: REPLICATED_API_TOKEN environment variable must be set" + exit 1 + fi + + if [ -z "{{.CUSTOMER_NAME}}" ]; then + echo "ERROR: CUSTOMER_NAME parameter is required" + exit 1 + fi + + # Run vendor-cli to inspect the customer and get the installation ID as JSON + echo "Running vendor-cli to inspect customer..." + CUSTOMER_JSON=$(docker run --rm \ + -e REPLICATED_API_TOKEN=$REPLICATED_API_TOKEN \ + -e REPLICATED_APP={{.APP_NAME}} \ + replicated/vendor-cli:latest \ + customer inspect --customer "{{.CUSTOMER_NAME}}" --output json) + + # Use jq to properly extract the installationId + INSTALLATION_ID=$(echo "$CUSTOMER_JSON" | jq -r '.installationId') + + # Check if we got a valid ID + if [ -z "$INSTALLATION_ID" ] || [ "$INSTALLATION_ID" = "null" ]; then + echo "Failed to extract installationId from customer JSON" + echo "JSON structure:" + echo "$CUSTOMER_JSON" | jq 'del(.installationId)' # Print JSON without the license ID + exit 1 + fi + + # Print the license ID so it can be captured + echo "$INSTALLATION_ID" + + # Download customer license + download:license:customer: + desc: Download license for a customer (requires REPLICATED_API_TOKEN and customer name) + cmds: + - echo "Downloading license for customer {{.CUSTOMER_NAME}}..." + - | + # Validate required environment variables + if [ -z "$REPLICATED_API_TOKEN" ]; then + echo "ERROR: REPLICATED_API_TOKEN environment variable must be set" + exit 1 + fi + + if [ -z "{{.CUSTOMER_NAME}}" ]; then + echo "ERROR: CUSTOMER_NAME parameter is required" + exit 1 + fi + + # Create a temporary directory for the license if it doesn't exist + mkdir -p /tmp/replicated + OUTPUT_FILE="/tmp/replicated/license-download-output.txt" + LICENSE_FILE="/tmp/replicated/license.yaml" + + # Run vendor-cli to download the customer license to a temporary file first + echo "Running vendor-cli to download license..." + TMP_LICENSE_FILE=$(mktemp) + set +e + docker run --rm \ + -e REPLICATED_API_TOKEN=$REPLICATED_API_TOKEN \ + -e REPLICATED_APP={{.APP_NAME}} \ + replicated/vendor-cli:latest \ + customer download-license --customer "{{.CUSTOMER_NAME}}" > "$TMP_LICENSE_FILE" 2>$OUTPUT_FILE + DOWNLOAD_EXIT_CODE=$? + set -e + + if [ $DOWNLOAD_EXIT_CODE -ne 0 ]; then + echo "ERROR: Failed to download license for customer {{.CUSTOMER_NAME}}" + echo "Error output:" + cat $OUTPUT_FILE + rm -f $OUTPUT_FILE "$TMP_LICENSE_FILE" + exit 1 + fi + + # Check if the file is empty + if [ ! 
-s "$TMP_LICENSE_FILE" ]; then + echo "ERROR: Downloaded license file is empty" + cat $OUTPUT_FILE + rm -f $OUTPUT_FILE "$TMP_LICENSE_FILE" + exit 1 + fi + + # Verify the license file is valid YAML + if command -v yq >/dev/null 2>&1; then + echo "Validating license file is proper YAML..." + if ! yq eval . "$TMP_LICENSE_FILE" > /dev/null 2>&1; then + echo "ERROR: Downloaded license file is not valid YAML" + echo "License file content:" + cat "$TMP_LICENSE_FILE" + rm -f $OUTPUT_FILE "$TMP_LICENSE_FILE" + exit 1 + fi + else + echo "WARNING: yq not found, skipping YAML validation" + fi + + # Remove any extra output or text before the YAML content + # This extracts content between first '---' and the end of file + if grep -q "^---" "$TMP_LICENSE_FILE"; then + echo "License appears to be in YAML format with document marker, extracting YAML content..." + sed -n '/^---/,$p' "$TMP_LICENSE_FILE" > "$LICENSE_FILE" + else + # If no '---' marker is found, check for '{' to identify JSON + if grep -q "{" "$TMP_LICENSE_FILE"; then + echo "License appears to be in JSON format, converting to YAML..." + if command -v yq >/dev/null 2>&1; then + cat "$TMP_LICENSE_FILE" | yq eval -P > "$LICENSE_FILE" + else + echo "ERROR: Cannot convert JSON to YAML without yq" + cat "$TMP_LICENSE_FILE" + rm -f $OUTPUT_FILE "$TMP_LICENSE_FILE" + exit 1 + fi + else + # If neither YAML nor JSON markers are found, just copy the file + echo "No YAML document marker or JSON found. Copying file as-is..." + cat "$TMP_LICENSE_FILE" > "$LICENSE_FILE" + fi + fi + + # Log some debug information + echo "License file content (first 5 lines):" + head -n 5 "$LICENSE_FILE" + + # Verify file exists and has content + if [ ! -s "$LICENSE_FILE" ]; then + echo "ERROR: Final license file is empty after processing" + rm -f $OUTPUT_FILE "$TMP_LICENSE_FILE" + exit 1 + fi + + echo "License successfully downloaded to $LICENSE_FILE" + rm -f $OUTPUT_FILE "$TMP_LICENSE_FILE" + + # Cleanup tasks + clean:files:charts: + desc: Clean packaged charts from KOTS directory + cmds: + - rm -f {{.KOTS_DIR}}/*.tgz + - echo "Chart packages cleaned from {{.KOTS_DIR}}" + + # Main clean task + clean:all: + desc: Clean all generated files + deps: [clean:files:charts] + cmds: + - echo "All generated files cleaned successfully" + + # Helm test task + test:install:helm: + desc: Run Helm installation test from Replicated registry + deps: [login:registry, setup:namespaces] + cmds: + - echo "Running Helm installation test with custom values..." + - | + # Determine OCI URL - prefer direct OCI_URL if provided, otherwise construct from app/channel + if [ -n "$OCI_URL" ]; then + echo "Using provided OCI URL: $OCI_URL" + else + echo "No direct OCI_URL provided. Constructing from REPLICATED_APP and REPLICATED_CHANNEL" + echo "Note: This requires REPLICATED_APP and REPLICATED_CHANNEL env vars." + if [ -z "$REPLICATED_APP" ] || [ -z "$REPLICATED_CHANNEL" ]; then + echo "ERROR: REPLICATED_APP and REPLICATED_CHANNEL must be set" + exit 1 + fi + OCI_URL="oci://registry.replicated.com/$REPLICATED_APP/$REPLICATED_CHANNEL" + echo "Constructed OCI URL: $OCI_URL" + fi + + # Validate OCI_URL is set and not empty + if [ -z "$OCI_URL" ]; then + echo "ERROR: OCI_URL is empty. Check that REPLICATED_APP and REPLICATED_CHANNEL are correctly set." 
+ echo "REPLICATED_APP=$REPLICATED_APP" + echo "REPLICATED_CHANNEL=$REPLICATED_CHANNEL" + exit 1 + fi + + # Prepare values arguments if provided + MLFLOW_VALUES_ARGS="" + if [ -n "$MLFLOW_VALUES" ]; then + echo "Using MLflow values file: $MLFLOW_VALUES" + # Check if values file exists + if [ ! -f "$MLFLOW_VALUES" ]; then + echo "ERROR: Values file '$MLFLOW_VALUES' does not exist" + exit 1 + fi + MLFLOW_VALUES_ARGS="--values $MLFLOW_VALUES" + echo "Values args: $MLFLOW_VALUES_ARGS" + else + echo "No custom values file provided. Using default values." + fi + + # Install infra chart from Replicated registry + echo "Installing infra chart from Replicated registry..." + echo "Chart path: $OCI_URL/infra" + helm upgrade --install infra $OCI_URL/infra \ + --namespace {{.NAMESPACE}} \ + --wait --timeout {{.TIMEOUT}} --debug || { + echo "ERROR: Failed to install infra chart from $OCI_URL/infra" + echo "Please check that registry login was successful and the chart exists in the registry." + exit 1 + } + + # Install MLflow chart from Replicated registry with custom values + echo "Installing mlflow chart from Replicated registry with custom values..." + echo "Chart path: $OCI_URL/mlflow" + echo "Using values args: $MLFLOW_VALUES_ARGS" + helm upgrade --install mlflow $OCI_URL/mlflow \ + --namespace {{.NAMESPACE}} \ + $MLFLOW_VALUES_ARGS \ + --wait --timeout {{.TIMEOUT}} --debug || { + echo "ERROR: Failed to install mlflow chart from $OCI_URL/mlflow" + echo "Please check that registry login was successful and the chart exists in the registry." + exit 1 + } + + echo "Helm installation with custom values completed successfully." + - task: forward:port + + # KOTS test task + test:install:kots: + desc: Run KOTS installation test + deps: [setup:namespaces] + cmds: + - echo "Running KOTS installation test..." + - | + if [ -z "$REPLICATED_LICENSE_ID" ]; then + echo "ERROR: REPLICATED_LICENSE_ID environment variable must be set" + exit 1 + fi + + if [ -z "$REPLICATED_APP" ]; then + echo "ERROR: REPLICATED_APP environment variable must be set" + exit 1 + fi + + if [ -z "$REPLICATED_CHANNEL" ]; then + echo "ERROR: REPLICATED_CHANNEL environment variable must be set" + exit 1 + fi + + # Create directory for license file if it doesn't exist + mkdir -p /tmp/replicated + LICENSE_FILE="/tmp/replicated/license.yaml" + + # Validate license file exists and has content + if [ ! -f "$LICENSE_FILE" ] || [ ! -s "$LICENSE_FILE" ]; then + echo "ERROR: License file does not exist or is empty at $LICENSE_FILE" + echo "Please download the license file using the customer:download-license task first" + exit 1 + fi + + # Verify license file is valid YAML + if command -v yq >/dev/null 2>&1; then + echo "Validating license file is proper YAML before installation..." + if ! yq eval . "$LICENSE_FILE" > /dev/null 2>&1; then + echo "ERROR: License file is not valid YAML" + echo "License file content:" + cat "$LICENSE_FILE" | head -n 10 + exit 1 + else + echo "✅ License file is valid YAML" + fi + else + echo "WARNING: yq not found, skipping YAML validation" + fi + + echo "Installing latest KOTS version..." + curl https://kots.io/install | bash + + echo "License file at $LICENSE_FILE (first 5 lines):" + head -n 5 "$LICENSE_FILE" + + echo "Installing application from Replicated..." 
+ echo "App: $REPLICATED_APP" + echo "Channel: $REPLICATED_CHANNEL" + echo "Using license file: $LICENSE_FILE" + + # Run KOTS install with detailed output + set -x + kubectl kots install $REPLICATED_APP/$REPLICATED_CHANNEL \ + --shared-password=replicatedmlflow \ + --license-file="$LICENSE_FILE" \ + --namespace=default \ + --wait-duration=10m \ + --skip-preflights + set +x + + # Check if installation succeeded + if [ $? -ne 0 ]; then + echo "❌ KOTS installation failed" + echo "Checking app status:" + kubectl get app -n default + echo "Checking pods:" + kubectl get pods -n default + echo "Checking pod logs:" + kubectl logs -n default -l app=kotsadm --tail=50 + exit 1 + fi + + echo "✅ KOTS installation completed. Setting up port forwarding for testing..." + - task: forward:port + + # Port forwarding task + forward:port: + desc: Setup port forwarding to MLflow service for testing + internal: true + cmds: + - echo "Setting up port forwarding to MLflow service..." + - | + # Wait for the MLflow service to be created + echo "Waiting for MLflow service to be created..." + MAX_RETRIES={{.WAIT_RETRIES}} + RETRY_INTERVAL={{.RETRY_INTERVAL}} + RETRY_COUNT=0 + SERVICE_FOUND=false + + while [ $RETRY_COUNT -lt $MAX_RETRIES ]; do + echo "Check $((RETRY_COUNT+1))/$MAX_RETRIES: Looking for MLflow service..." + if kubectl get svc mlflow -n {{.NAMESPACE}} --no-headers 2>/dev/null; then + echo "✅ MLflow service found!" + SERVICE_FOUND=true + break + else + echo "MLflow service not found yet. Waiting $RETRY_INTERVAL seconds..." + RETRY_COUNT=$((RETRY_COUNT+1)) + sleep $RETRY_INTERVAL + fi + done + + if [ "$SERVICE_FOUND" != "true" ]; then + echo "❌ ERROR: MLflow service not found after $((MAX_RETRIES * RETRY_INTERVAL)) seconds." + echo "Showing all available services in the namespace:" + kubectl get svc -n {{.NAMESPACE}} + echo "Showing pod status in the namespace:" + kubectl get pods -n {{.NAMESPACE}} + echo "Showing pod details:" + kubectl describe pods -n {{.NAMESPACE}} -l app.kubernetes.io/name=mlflow + exit 1 + fi + + # Verify the services are present + echo "Verifying MLflow service exists..." + kubectl get svc -n {{.NAMESPACE}} + + # Check pod status and wait for them to be running + echo "Checking pod status..." + kubectl get pods -n {{.NAMESPACE}} + + echo "Waiting for MLflow pods to be running..." + kubectl wait --for=condition=Ready pods --selector=app.kubernetes.io/name=mlflow -n {{.NAMESPACE}} --timeout={{.TIMEOUT}} || { + echo "WARNING: Timed out waiting for pods to be ready, will try port-forwarding anyway" + kubectl describe pods -n {{.NAMESPACE}} + } + + SERVICE_NAME=$(kubectl get svc -n {{.NAMESPACE}} -l app.kubernetes.io/name=mlflow -o name | head -n 1) + if [ -z "$SERVICE_NAME" ]; then + echo "ERROR: Could not find MLflow service with label app.kubernetes.io/name=mlflow" + exit 1 + fi + + echo "Setting up port forwarding to $SERVICE_NAME..." + # Set up port forwarding in the background with logs + echo "Setting up port forwarding using nohup..." + # Use nohup to ensure the process runs in the background even if the parent process exits + PORT_FORWARD_LOG="/tmp/port-forward-mlflow-$$.log" + nohup kubectl port-forward -n {{.NAMESPACE}} $SERVICE_NAME {{.PORT}}:5000 > $PORT_FORWARD_LOG 2>&1 & + PORT_FORWARD_PID=$! + + # Give port-forward a moment to start + sleep 2 + + # Verify the PID was captured properly + if [ -z "$PORT_FORWARD_PID" ] || [ "$PORT_FORWARD_PID" = "0" ]; then + echo "ERROR: Failed to capture port-forward process PID" + echo "Attempting alternate port forwarding method..." 
+ + # Alternative approach - use a fixed port file to track the PID + PID_FILE="/tmp/mlflow-portforward.pid" + rm -f $PID_FILE + + # Use a background task with PID file + ( + kubectl port-forward -n {{.NAMESPACE}} $SERVICE_NAME {{.PORT}}:5000 > $PORT_FORWARD_LOG 2>&1 & + echo $! > $PID_FILE + wait + ) & + + # Wait a moment and check if the PID file was created + sleep 3 + if [ -f $PID_FILE ]; then + PORT_FORWARD_PID=$(cat $PID_FILE) + echo "Port forwarding set up with alternate method, PID: $PORT_FORWARD_PID" + else + echo "ERROR: Both port forwarding methods failed. Continuing anyway..." + # Continue anyway and rely on curl checks to verify connectivity + PORT_FORWARD_PID="" + fi + else + echo "Port forwarding set up with PID: $PORT_FORWARD_PID" + fi + + # Give port-forward more time to establish + echo "Waiting for port-forward to establish..." + sleep 5 + + # Only check process if we have a PID + if [ -n "$PORT_FORWARD_PID" ]; then + # Check if port-forward process is still running + if ! ps -p $PORT_FORWARD_PID > /dev/null 2>&1; then + echo "WARNING: Port forwarding process with PID $PORT_FORWARD_PID is not running" + echo "Port forwarding log:" + cat $PORT_FORWARD_LOG || echo "No log file found" + echo "Will try to connect anyway..." + fi + fi + + # Check if port-forward is still running + if [ -n "$PORT_FORWARD_PID" ] && ! ps -p $PORT_FORWARD_PID > /dev/null 2>&1; then + echo "ERROR: Port forwarding process died during connection attempts." + echo "Port forwarding log:" + cat $PORT_FORWARD_LOG || echo "No log file found" + + # Restart port forwarding as a fallback + echo "Attempting to restart port forwarding..." + nohup kubectl port-forward -n {{.NAMESPACE}} $SERVICE_NAME {{.PORT}}:5000 > $PORT_FORWARD_LOG 2>&1 & + PORT_FORWARD_PID=$! + sleep 3 + + if [ -z "$PORT_FORWARD_PID" ] || [ "$PORT_FORWARD_PID" = "0" ]; then + echo "WARNING: Failed to capture restarted port-forward process PID" + echo "Will continue without checking process status" + else + echo "Restarted port forwarding with PID: $PORT_FORWARD_PID" + fi + + sleep 5 # Give the new port-forward time to establish + fi + + # Basic connectivity check + echo "Checking connectivity to MLflow on localhost:{{.PORT}}..." + MAX_CONN_RETRIES=5 + CONN_RETRY_COUNT=0 + CONN_SUCCESS=false + + while [ $CONN_RETRY_COUNT -lt $MAX_CONN_RETRIES ]; do + CONN_RETRY_COUNT=$((CONN_RETRY_COUNT+1)) + echo "Connection attempt $CONN_RETRY_COUNT/$MAX_CONN_RETRIES..." + + # Try curling the MLflow endpoint + if curl -s -o /dev/null -w "%{http_code}" http://localhost:{{.PORT}}/ > /dev/null 2>&1; then + echo "Successfully connected to MLflow service!" + CONN_SUCCESS=true + break + else + echo "Connection attempt $CONN_RETRY_COUNT failed, retrying in 5 seconds..." + + # Check if port-forward is still running + if [ -n "$PORT_FORWARD_PID" ] && ! ps -p $PORT_FORWARD_PID > /dev/null 2>&1; then + echo "WARNING: Port forwarding process with PID $PORT_FORWARD_PID is not running" + echo "Port forwarding log:" + cat $PORT_FORWARD_LOG || echo "No log file found" + echo "Will try to connect anyway..." + fi + + sleep 5 + fi + done + + if [ "$CONN_SUCCESS" != "true" ]; then + echo "WARNING: Could not connect to MLflow service after $MAX_CONN_RETRIES attempts." + echo "This may indicate issues with the service or port forwarding." + echo "Port forwarding log:" + cat $PORT_FORWARD_LOG + echo "Pod logs:" + kubectl logs -n {{.NAMESPACE}} -l app.kubernetes.io/name=mlflow --tail=20 || true + echo "Continuing anyway, but tests may fail." 
+ fi + + echo "Port forwarding setup completed." + + # Local installation task (renamed from test:install:local) + install:helm:local: + desc: Install MLflow with local Helm charts for development (with Replicated SDK disabled) + deps: [add:repos:helm, setup:namespaces] + cmds: + - echo "Installing MLflow with local Helm charts (Replicated SDK disabled)..." + - | + # Prepare values arguments if provided + MLFLOW_VALUES_ARGS="" + if [ -n "$MLFLOW_VALUES" ]; then + echo "Using MLflow values file: $MLFLOW_VALUES" + # Check if values file exists + if [ ! -f "$MLFLOW_VALUES" ]; then + echo "ERROR: Values file '$MLFLOW_VALUES' does not exist" + exit 1 + fi + MLFLOW_VALUES_ARGS="--values $MLFLOW_VALUES" + echo "Values args: $MLFLOW_VALUES_ARGS" + else + echo "No custom values file provided. Using default values." + fi + + # Install infra chart from local directory + echo "Installing infra chart from local directory..." + helm upgrade --install infra {{.CHART_DIR}}/infra \ + --namespace {{.NAMESPACE}} \ + --wait --timeout {{.TIMEOUT}} --debug || { + echo "ERROR: Failed to install infra chart from {{.CHART_DIR}}/infra" + exit 1 + } + + # Install MLflow chart from local directory with custom values + # Note: We explicitly disable the Replicated SDK for local development to avoid + # dependencies on the Replicated platform during development + echo "Installing mlflow chart from local directory (Replicated SDK disabled)..." + echo "Using values args: $MLFLOW_VALUES_ARGS" + helm upgrade --install mlflow {{.CHART_DIR}}/mlflow \ + --namespace {{.NAMESPACE}} \ + --set replicated.enabled=false \ + $MLFLOW_VALUES_ARGS \ + --wait --timeout {{.TIMEOUT}} --debug || { + echo "ERROR: Failed to install mlflow chart from {{.CHART_DIR}}/mlflow" + exit 1 + } + + echo "Local Helm installation completed successfully." + - task: forward:port + + # App test task + run:tests:app: + desc: Run application tests against the running MLflow service + cmds: + - echo "Running application tests against MLflow on localhost:{{.PORT}}..." + - | + echo "Installing Python dependencies for tests..." 
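+        # Note: the '>=' version constraints below should be quoted (e.g. "pandas>=2.0.0"),
+        # otherwise the shell treats '>' as output redirection and the constraints are dropped.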
+ pip3 install setuptools mlflow==2.11.0 pandas>=2.0.0 scikit-learn>=1.3.0 requests>=2.31.0 urllib3>=2.0.0 + + echo "Running MLflow application tests" + python {{.TESTS_DIR}}/mlflow_test.py localhost:{{.PORT}} \ + --protocol http \ + --connection-timeout 180 \ + --debug + + # All tests task + run:tests:all: + desc: Run all tests + deps: [test:install:helm, run:tests:app] + cmds: + - echo "All tests completed successfully" + + # Version extraction + extract:version:chart: + desc: Extract and print the MLflow chart version + cmds: + - | + echo "{{.MLFLOW_VERSION}}" + silent: true \ No newline at end of file diff --git a/applications/mlflow/charts/mlflow/Chart.lock b/applications/mlflow/charts/mlflow/Chart.lock index 8b5e2d11..62851592 100644 --- a/applications/mlflow/charts/mlflow/Chart.lock +++ b/applications/mlflow/charts/mlflow/Chart.lock @@ -1,6 +1,6 @@ dependencies: - name: replicated repository: oci://registry.replicated.com/library - version: 1.1.0 -digest: sha256:9debf14266b4425bcc68a80da11527f8d0c5e68f678cae676179557bab423ae6 -generated: "2025-02-09T00:40:36.875661-05:00" + version: 1.5.0 +digest: sha256:47a29e041d280e6e5db79c0dcf469b5c43cef2d780169fa7cd40e9b02e9b1fd5 +generated: "2025-04-07T10:50:18.860452-04:00" diff --git a/applications/mlflow/charts/mlflow/values.yaml b/applications/mlflow/charts/mlflow/values.yaml index b63500ae..2e1ac387 100644 --- a/applications/mlflow/charts/mlflow/values.yaml +++ b/applications/mlflow/charts/mlflow/values.yaml @@ -87,20 +87,20 @@ mlflow: # secretKeyRef: # name: extra-env-secret # key: extra-env-key-3 - + # -- Extra initialization containers belonging to the mlflow pod. extraInitContainers: [] - + # -- Extra containers belonging to the mlflow pod. extraContainers: [] - + # -- Extra environment variable sources in mlflow container extraEnvFrom: [] # - configMapRef: # name: extra-env-configmap # - secretRef: # name: extra-env-secret - + # -- Extra volumes that can be mounted by containers belonging to the mlflow pod extraVolumes: [] # - name: mlflow-volume @@ -109,7 +109,7 @@ mlflow: # - name: mlflow-configmap-volume # configMap: # name: mlflow-configmap - + # -- Extra volume mounts to mount into the mlflow container's file system extraVolumeMounts: [] # - name: mlflow-volume @@ -172,7 +172,7 @@ mlflow: # -- Enable/disable the generation of environment variables for services. # [[ref]](https://kubernetes.io/docs/concepts/services-networking/connect-applications-service/#accessing-the-service) enableServiceLinks: true - + # -- Configure the lifecycle for the container lifecycle: {} termination: @@ -200,7 +200,7 @@ mlflow: name: http # -- Annotations to add to the service annotations: {} - + # -- Mlflow Ingress configuration # [[ref]](https://kubernetes.io/docs/concepts/services-networking/ingress/) ingress: @@ -262,7 +262,7 @@ mlflow: # - --expose-prometheus /metrics # If enabled, run the server with debug logging and auto-reload - --dev - + # Basic authentication configuration, # for more information, please visit https://mlflow.org/docs/latest/auth/index.html#configuration basicAuth: @@ -328,7 +328,7 @@ mlflow: replicated: # -- Specifies whetherto enable the Replicated SDK - enabled: false + enabled: true minio: enabled: true @@ -349,7 +349,7 @@ minio: pullPolicy: IfNotPresent # -- Image pull secrets imagePullSecret: {} - # -- + # -- scheduler: {} # -- The Kubernetes secret name that contains MinIO environment variable configurations. # The secret is expected to have a key named config.env containing environment variables exports. 
@@ -389,7 +389,7 @@ minio: # [[ref]](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity) podAntiAffinityMode: "soft" # -- The `Requests or Limits `__ for resources to associate to Tenant pods. - resources: { } + resources: {} # -- The Kubernetes `SecurityContext `__ to use for deploying Tenant resources. securityContext: runAsUser: 1000 @@ -409,7 +409,7 @@ minio: seccompProfile: type: RuntimeDefault # -- An array of `Topology Spread Constraints `__ to associate to Operator Console pods. - topologySpreadConstraints: [ ] + topologySpreadConstraints: [] # -- The name of a custom `Container Runtime `__ to use for the Operator Console pods. runtimeClassName: "" # -- The mount path where Persistent Volumes are mounted inside Tenant container(s). @@ -428,7 +428,7 @@ minio: externalCaCertSecret: [] # -- Specify an array of Kubernetes secrets, where each entry corresponds to a secret contains the TLS private key and public certificate pair. # See `Operator CRD: TenantSpec `__. - externalCertSecret: [ ] + externalCertSecret: [] # Enable automatic Kubernetes based `certificate generation and signing `__ requestAutoCert: true # -- See `Operator CRD: CertificateConfig `__ @@ -445,25 +445,25 @@ minio: # -- Array of Kubernetes secrets from which the Operator generates MinIO users during tenant provisioning. # Each secret should specify the ``CONSOLE_ACCESS_KEY`` and ``CONSOLE_SECRET_KEY`` as the access key and secret key for that user. users: [] - # -- The `PodManagement `__ policy for MinIO Tenant Pods. + # -- The `PodManagement `__ policy for MinIO Tenant Pods. # Can be "OrderedReady" or "Parallel" podManagementPolicy: Parallel - # -- The `Liveness Probe `__ for monitoring Tenant pod liveness. + # -- The `Liveness Probe `__ for monitoring Tenant pod liveness. # Tenant pods will be restarted if the probe fails. liveness: {} # -- `Readiness Probe `__ for monitoring Tenant container readiness. # Tenant pods will be removed from service endpoints if the probe fails. - # -- `Startup Probe `__ for monitoring container startup. + # -- `Startup Probe `__ for monitoring container startup. # Tenant pods will be restarted if the probe fails. startup: {} # -- The `Lifecycle hooks `__ for container. - lifecycle: { } + lifecycle: {} # -- Directs the Operator to deploy the MinIO S3 API and Console services as LoadBalancer objects. # If the Kubernetes cluster has a configured LoadBalancer, it can attempt to route traffic to those services automatically. # Specify ``minio: true`` to expose the MinIO S3 API. # Specify ``console: true`` to expose the Console. # Both fields default to ``false``. - exposeServices: { } + exposeServices: {} # -- The `Kubernetes Service Account `__ associated with the Tenant. serviceAccountName: "" # -- Directs the Operator to add the Tenant's metric scrape configuration to an existing Kubernetes Prometheus deployment managed by the Prometheus Operator. @@ -472,24 +472,24 @@ minio: # Specify ``json`` for JSON-formatted logs. # Specify ``anonymous`` for anonymized logs. # Specify ``quiet`` to supress logging. - logging: { } + logging: {} # -- serviceMetadata allows passing additional labels and annotations to MinIO and Console specific # services created by the operator. 
- serviceMetadata: { } + serviceMetadata: {} # -- Add environment variables to be set in MinIO container (https://github.com/minio/minio/tree/master/docs/config) - env: [ ] + env: [] # -- PriorityClassName indicates the Pod priority and hence importance of a Pod relative to other Pods. # This is applied to MinIO pods only. # Refer Kubernetes documentation for details https://kubernetes.io/docs/concepts/configuration/pod-priority-preemption/#priorityclass/ priorityClassName: "" # -- An array of `Volumes `__ which the Operator can mount to Tenant pods. # The volumes must exist *and* be accessible to the Tenant pods. - additionalVolumes: [ ] + additionalVolumes: [] # -- An array of volume mount points associated to each Tenant container. - additionalVolumeMounts: [ ] + additionalVolumeMounts: [] # Define configuration for KES (stateless and distributed key-management system) # Refer https://github.com/minio/kes - #kes: + # kes: # ## Image field: # # Image from tag (original behavior), for example: # # image: @@ -660,7 +660,7 @@ postgres: # and then blank the password of the postgres user by setting it to NULL. enableSuperuserAccess: true superuserSecret: "" - + # -- This feature enables declarative management of existing roles, as well as the creation of new roles if they are not # already present in the database. # See: https://cloudnative-pg.io/documentation/current/declarative_role_management/ diff --git a/applications/mlflow/tests/helm/nodeport-ingress-disabled.yaml b/applications/mlflow/tests/helm/nodeport-ingress-disabled.yaml new file mode 100644 index 00000000..2de896cc --- /dev/null +++ b/applications/mlflow/tests/helm/nodeport-ingress-disabled.yaml @@ -0,0 +1,17 @@ +# Test values for MLflow CI pipeline +# These values specifically configure the service to use NodePort for testing + +mlflow: + ingress: + enabled: false + # Service configuration for MLflow + service: + # Use NodePort to expose the service on a specific port + type: NodePort + # Service port number (internal) + port: 5000 + # Hardcoded nodePort for consistent access + # Note: Must be between 30000-32767 + nodePort: 30080 + # Service port name + name: http diff --git a/applications/mlflow/tests/mlflow_test.py b/applications/mlflow/tests/mlflow_test.py new file mode 100644 index 00000000..5a64ed0e --- /dev/null +++ b/applications/mlflow/tests/mlflow_test.py @@ -0,0 +1,267 @@ +#!/usr/bin/env python3 + +import sys +import os +import argparse +import subprocess +import mlflow +from mlflow.models import infer_signature +import requests +import time +import socket +from urllib.parse import urlparse +import logging + +import pandas as pd +from sklearn import datasets +from sklearn.model_selection import train_test_split +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import accuracy_score + +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +def check_server_connection(tracking_uri, timeout=30, retry_interval=5): + """ + Check if the MLflow server is reachable + + Args: + tracking_uri: The URI of the MLflow server + timeout: Maximum time in seconds to wait for the server + retry_interval: Interval in seconds between retries + + Returns: + bool: True if the server is reachable, False otherwise + """ + logger.info(f"Checking connection to MLflow server at {tracking_uri}") + + url = tracking_uri + if not url.endswith('/'): + url += '/' + + # Simple health check - just try to access the root URL + 
health_url = url + + # Parse URL to get host and port for socket check + parsed_url = urlparse(tracking_uri) + host = parsed_url.hostname + port = parsed_url.port or (443 if parsed_url.scheme == 'https' else 80) + + # Authentication credentials + auth = ("admin", "password") + + start_time = time.time() + while time.time() - start_time < timeout: + # First try a basic socket connection + try: + socket.create_connection((host, port), timeout=5) + logger.info(f"Socket connection to {host}:{port} successful") + except (socket.timeout, socket.error, ConnectionRefusedError) as e: + logger.warning(f"Socket connection failed: {e}") + logger.info(f"Retrying in {retry_interval} seconds...") + time.sleep(retry_interval) + continue + + # Then try an HTTP request to the root URL + try: + # For our test environment, always disable SSL verification and include auth + response = requests.get(health_url, timeout=5, verify=False, auth=auth) + status_code = response.status_code + logger.info(f"Server returned status code: {status_code}") + + # 200 OK, 302 Found (redirect), or 401 Unauthorized (at least server is responding) + if status_code in (200, 302, 401): + logger.info(f"MLflow server is reachable at {tracking_uri}") + return True + else: + logger.warning(f"MLflow server returned unexpected status code {status_code}") + except requests.exceptions.RequestException as e: + logger.warning(f"HTTP request failed: {e}") + + logger.info(f"Retrying in {retry_interval} seconds...") + time.sleep(retry_interval) + + logger.error(f"Could not connect to MLflow server at {tracking_uri} after {timeout} seconds") + return False + +def run_mlflow_test(tracking_uri, connection_timeout=60): + """ + Run MLflow test with the specified tracking URI + + Args: + tracking_uri: The URI to use for the MLflow tracking server + connection_timeout: Timeout in seconds for server connection + + Returns: + True if the test passed, False otherwise + """ + try: + logger.info(f"Setting MLflow tracking URI to: {tracking_uri}") + + # Disable SSL warnings for self-signed certificates when using HTTPS + if tracking_uri.startswith('https://'): + import urllib3 + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + + # Check if the server is reachable before proceeding + if not check_server_connection(tracking_uri, timeout=connection_timeout): + logger.error("Failed to connect to MLflow server, aborting test") + return False + + # Set MLflow tracking URI with authentication + # Format: http(s)://username:password@hostname:port + parsed_url = urlparse(tracking_uri) + auth_url = f"{parsed_url.scheme}://admin:password@{parsed_url.netloc}{parsed_url.path}" + logger.info(f"Using authenticated tracking URI") + mlflow.set_tracking_uri(auth_url) + + # Load the Iris dataset + logger.info("Loading dataset and training model...") + X, y = datasets.load_iris(return_X_y=True) + + # Split the data into training and test sets + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.2, random_state=42 + ) + + # Define the model hyperparameters + params = { + "solver": "lbfgs", + "max_iter": 1000, + "multi_class": "auto", # Deprecated but keeping for now + "random_state": 8888, + } + + # Train the model + lr = LogisticRegression(**params) + lr.fit(X_train, y_train) + + # Predict on the test set + y_pred = lr.predict(X_test) + + # Calculate metrics + accuracy = accuracy_score(y_test, y_pred) + + logger.info(f"Current tracking URI: {mlflow.get_tracking_uri()}") + logger.info(f"Model trained with accuracy: {accuracy:.4f}") + + # 
Create a new MLflow Experiment + logger.info("Creating MLflow experiment...") + experiment_name = "MLflow CI Test" + try: + experiment = mlflow.get_experiment_by_name(experiment_name) + if experiment is None: + experiment_id = mlflow.create_experiment(experiment_name) + logger.info(f"Created new experiment with ID: {experiment_id}") + else: + logger.info(f"Using existing experiment with ID: {experiment.experiment_id}") + mlflow.set_experiment(experiment_name) + except Exception as e: + logger.error(f"Failed to create or set experiment: {e}") + return False + + # Start an MLflow run + logger.info("Starting MLflow run...") + try: + with mlflow.start_run(): + # Log the hyperparameters + mlflow.log_params(params) + + # Log the loss metric + mlflow.log_metric("accuracy", accuracy) + + # Set a tag that we can use to remind ourselves what this run was for + mlflow.set_tag("Training Info", "CI Test for MLflow") + + # Infer the model signature + signature = infer_signature(X_train, lr.predict(X_train)) + + # Log the model + logger.info("Logging model to MLflow...") + model_info = mlflow.sklearn.log_model( + sk_model=lr, + artifact_path="iris_model", + registered_model_name="ci-test-model", + signature=signature + ) + + logger.info(f"Model URI: {model_info.model_uri}") + + # Load the model back for predictions as a generic Python Function model + try: + logger.info("Loading model for predictions...") + loaded_model = mlflow.pyfunc.load_model(model_info.model_uri) + predictions = loaded_model.predict(X_test[:3]) + logger.info(f"Test predictions: {predictions}") + return True + except Exception as e: + logger.error(f"Error loading model: {e}") + return False + except Exception as e: + logger.error(f"Error during MLflow run: {e}") + return False + + except Exception as e: + logger.error(f"Test failed with error: {e}") + import traceback + logger.error(traceback.format_exc()) + return False + +def ensure_dependencies(): + """Ensure required packages are installed.""" + try: + import mlflow + import pandas + import sklearn + import requests + except ImportError: + logger.info("Installing required dependencies...") + subprocess.check_call([ + sys.executable, "-m", "pip", "install", + "mlflow", "pandas", "scikit-learn", "requests" + ]) + +def main(): + parser = argparse.ArgumentParser(description="MLflow CI testing tool") + parser.add_argument("hostname", help="Hostname of the MLflow server") + parser.add_argument("--port", type=int, help="Port number (if not included in hostname)") + parser.add_argument("--protocol", default="https", help="Protocol (http or https, default: https)") + parser.add_argument("--connection-timeout", type=int, default=60, + help="Timeout in seconds for server connection (default: 60)") + parser.add_argument("--debug", action="store_true", help="Enable debug logs") + + args = parser.parse_args() + + # Set logging level based on debug flag + if args.debug: + logging.getLogger().setLevel(logging.DEBUG) + + # Build the tracking URI + tracking_uri = f"{args.protocol}://{args.hostname}" + if args.port: + tracking_uri += f":{args.port}" + + # Show protocol info + if args.protocol == "http": + logger.info("Using HTTP protocol (insecure)") + + # Note about hardcoded credentials + logger.info("Using hardcoded authentication (admin/password)") + + # Ensure dependencies are installed + ensure_dependencies() + + # Run the test + logger.info(f"Starting MLflow test against server: {tracking_uri}") + success = run_mlflow_test(tracking_uri, connection_timeout=args.connection_timeout) + + if 
success: + logger.info("✅ MLflow test completed successfully") + sys.exit(0) + else: + logger.error("❌ MLflow test failed") + sys.exit(1) + +if __name__ == "__main__": + main() \ No newline at end of file From 9dbddbf4b791e8465cb500a5f1a09d9ddff7f15f Mon Sep 17 00:00:00 2001 From: Diamon Wiggins Date: Mon, 14 Apr 2025 10:33:44 -0400 Subject: [PATCH 02/18] add aks and gke to test coverage --- .github/workflows/mlflow-ci.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/mlflow-ci.yml b/.github/workflows/mlflow-ci.yml index 78d37cb4..4e82a7dd 100644 --- a/.github/workflows/mlflow-ci.yml +++ b/.github/workflows/mlflow-ci.yml @@ -145,6 +145,10 @@ jobs: cluster: - distribution: kind version: 1.32 + - distribution: aks + version: 1.31 + - distribution: gke + version: 1.32 config: - name: nodeport-ingress-disabled values_file: tests/helm/nodeport-ingress-disabled.yaml @@ -299,6 +303,10 @@ jobs: cluster: - distribution: kind version: 1.32 + - distribution: aks + version: 1.31 + - distribution: gke + version: 1.32 steps: - name: Checkout uses: actions/checkout@v4 From 4cd2741eab38071c897cb69b054b1d97ce73225d Mon Sep 17 00:00:00 2001 From: Diamon Wiggins Date: Mon, 14 Apr 2025 10:43:18 -0400 Subject: [PATCH 03/18] remove instance type from create cluster --- .github/workflows/mlflow-ci.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/mlflow-ci.yml b/.github/workflows/mlflow-ci.yml index 4e82a7dd..cf3eca6e 100644 --- a/.github/workflows/mlflow-ci.yml +++ b/.github/workflows/mlflow-ci.yml @@ -221,7 +221,6 @@ jobs: kubernetes-version: ${{ matrix.cluster.version }} cluster-name: mlflow-ci-${{ github.run_id }}-${{ matrix.cluster.distribution }}-${{ matrix.cluster.version }}-${{ matrix.config.name }} disk: 100 - instance-type: r1.large ttl: 1h export-kubeconfig: true From 6d9e1e2c20a5f8460a1127da01fa9656cd4905c2 Mon Sep 17 00:00:00 2001 From: Diamon Wiggins Date: Mon, 14 Apr 2025 10:54:11 -0400 Subject: [PATCH 04/18] remove instance type from create cluster --- .github/workflows/mlflow-ci.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/mlflow-ci.yml b/.github/workflows/mlflow-ci.yml index cf3eca6e..08579dba 100644 --- a/.github/workflows/mlflow-ci.yml +++ b/.github/workflows/mlflow-ci.yml @@ -337,7 +337,6 @@ jobs: kubernetes-version: ${{ matrix.cluster.version }} cluster-name: mlflow-kots-${{ github.run_id }}-${{ matrix.cluster.distribution }}-${{ matrix.cluster.version }} disk: 100 - instance-type: r1.large ttl: 1h export-kubeconfig: true From c0263fc79607025f11a12c24e37f136ee087f7f8 Mon Sep 17 00:00:00 2001 From: Diamon Wiggins Date: Mon, 14 Apr 2025 11:15:03 -0400 Subject: [PATCH 05/18] set nodecount to 3 and use k3s instead of kind --- .github/workflows/mlflow-ci.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/mlflow-ci.yml b/.github/workflows/mlflow-ci.yml index 08579dba..f537e362 100644 --- a/.github/workflows/mlflow-ci.yml +++ b/.github/workflows/mlflow-ci.yml @@ -143,7 +143,7 @@ jobs: fail-fast: false matrix: cluster: - - distribution: kind + - distribution: k3s version: 1.32 - distribution: aks version: 1.31 @@ -221,6 +221,7 @@ jobs: kubernetes-version: ${{ matrix.cluster.version }} cluster-name: mlflow-ci-${{ github.run_id }}-${{ matrix.cluster.distribution }}-${{ matrix.cluster.version }}-${{ matrix.config.name }} disk: 100 + nodes: 3 ttl: 1h export-kubeconfig: true @@ -300,7 +301,7 @@ jobs: fail-fast: false matrix: cluster: - - distribution: kind + - distribution: k3s 
version: 1.32 - distribution: aks version: 1.31 @@ -337,6 +338,7 @@ jobs: kubernetes-version: ${{ matrix.cluster.version }} cluster-name: mlflow-kots-${{ github.run_id }}-${{ matrix.cluster.distribution }}-${{ matrix.cluster.version }} disk: 100 + nodes: 3 ttl: 1h export-kubeconfig: true From 87ff645e3a88cd41d6c63d83bcc92154f29fd50b Mon Sep 17 00:00:00 2001 From: Diamon Wiggins Date: Mon, 14 Apr 2025 11:50:49 -0400 Subject: [PATCH 06/18] remove aks --- .github/workflows/mlflow-ci.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/mlflow-ci.yml b/.github/workflows/mlflow-ci.yml index f537e362..7f94c44a 100644 --- a/.github/workflows/mlflow-ci.yml +++ b/.github/workflows/mlflow-ci.yml @@ -145,8 +145,6 @@ jobs: cluster: - distribution: k3s version: 1.32 - - distribution: aks - version: 1.31 - distribution: gke version: 1.32 config: @@ -303,8 +301,6 @@ jobs: cluster: - distribution: k3s version: 1.32 - - distribution: aks - version: 1.31 - distribution: gke version: 1.32 steps: From 5958c4ea26f017c28e3e13840f4e41205728d508 Mon Sep 17 00:00:00 2001 From: Diamon Wiggins Date: Mon, 14 Apr 2025 12:25:12 -0400 Subject: [PATCH 07/18] improve docs --- applications/mlflow/DEVELOPMENT.md | 215 +++++++++ applications/mlflow/README.md | 258 +++++----- applications/mlflow/charts/mlflow/README.md | 441 ++++++++++++++++++ .../mlflow/charts/mlflow/README.md.gotmpl | 204 ++++++++ .../charts/mlflow/README_CHANGELOG.md.gotmpl | 46 ++ .../charts/mlflow/README_CONFIG.md.gotmpl | 139 ++++++ 6 files changed, 1154 insertions(+), 149 deletions(-) create mode 100644 applications/mlflow/DEVELOPMENT.md create mode 100644 applications/mlflow/charts/mlflow/README.md.gotmpl create mode 100644 applications/mlflow/charts/mlflow/README_CHANGELOG.md.gotmpl create mode 100644 applications/mlflow/charts/mlflow/README_CONFIG.md.gotmpl diff --git a/applications/mlflow/DEVELOPMENT.md b/applications/mlflow/DEVELOPMENT.md new file mode 100644 index 00000000..6b316ecd --- /dev/null +++ b/applications/mlflow/DEVELOPMENT.md @@ -0,0 +1,215 @@ +# MLflow Development Guide + +This document provides information about developing, testing, and releasing the MLflow application using the included Taskfile. + +## Development Workflow + +The MLflow application includes a Taskfile.yml that provides tasks for developing, testing, and publishing the application. + +### Prerequisites + +- [Task](https://taskfile.dev/#/installation) command line tool +- Kubernetes cluster configured in your current context +- kubectl, helm, and python3 installed + +### Local Development + +Follow this workflow for development: + +1. Add required Helm repositories and update dependencies: + ```bash + task add:repos:helm + task update:deps:helm + ``` + +2. Lint charts to check for issues: + ```bash + task lint + ``` + +3. Template charts to verify the rendered manifests: + ```bash + task template + ``` + +4. Install charts for development: + ```bash + # Installs with Replicated SDK disabled + task install:helm:local + + # Optionally specify a custom values file + MLFLOW_VALUES=./my-values.yaml task install:helm:local + ``` + + > **Note:** For local development, the Replicated SDK is explicitly disabled (`replicated.enabled=false`). This allows development without requiring access to the Replicated platform. + > + > This task automatically sets up port forwarding from localhost:5000 to the MLflow service in the cluster, making the application available for testing. 
+ > + > The Helm releases are created with names `infra` and `mlflow` in the `mlflow` namespace. + +5. Run application tests: + ```bash + task run:tests:app + ``` + +6. Make changes to your charts and repeat steps 2-5 as needed + +This workflow allows rapid iteration without needing to publish to the Replicated registry. + +### Task Reference + +Tasks follow a `verb:resource[:subresource]` naming convention for clarity: + +```bash +# Validation and verification +task lint # Lint Helm charts +task template # Render templates to stdout (SDK disabled) +task check:versions # Verify Chart.yaml and KOTS manifest versions match + +# Repository and dependency management +task add:repos:helm # Add required Helm repositories +task update:deps:helm # Update Helm chart dependencies + +# Packaging and versioning +task update:versions:chart # Update chart version refs in KOTS manifests +task package:charts # Package Helm charts for distribution +task extract:version:chart # Extract current MLflow chart version + +# Installation +task install:helm:local # Install charts for local development (SDK disabled) + +# Testing +task test:install:helm # Test with charts from Replicated registry +task test:install:kots # Test KOTS installation +task run:tests:app # Run application tests against running MLflow +task run:tests:all # Run all tests (Helm install + app tests) + +# Release management +task create:release # Create a Replicated release + +# Cleanup +task clean:files:charts # Clean packaged chart files +task clean:all # Clean all generated files +``` + +## Releasing + +### Updating Documentation + +Before creating a release, ensure the documentation is up-to-date: + +1. Update version information in `charts/mlflow/Chart.yaml` if needed. + +2. Update the changelog in `charts/mlflow/README_CHANGELOG.md.gotmpl` with details about the new release. + +3. Generate documentation using helm-docs: + ```bash + # From the mlflow chart directory + cd charts/mlflow + + # If helm-docs is installed locally + helm-docs + + # Or use Docker + docker run --rm -v "$(pwd):/helm-docs" -u $(id -u) jnorwood/helm-docs:latest + ``` + +4. Verify the generated documentation: + - `README.md` - Main chart documentation + - `README_CHANGELOG.md` - Changelog + - `README_CONFIG.md` - Configuration reference + +### Publishing Replicated Releases + +When you're ready to publish your changes to the Replicated platform: + +1. Update the version in `charts/mlflow/Chart.yaml` if necessary. + +2. Update documentation: + ```bash + # If helm-docs is not installed + cd charts/mlflow + docker run --rm -v "$(pwd):/helm-docs" -u $(id -u) jnorwood/helm-docs:latest + ``` + +3. Set up the required environment variables: + ```bash + # Replicated API token for authentication + export REPLICATED_API_TOKEN=your_api_token + + # App and channel to publish to + export REPLICATED_APP=app_slug + export REPLICATED_CHANNEL=channel_name + ``` + +4. Package the charts and update version references: + ```bash + # This updates KOTS manifests with the current chart versions + # and packages the charts as .tgz files + task package:charts + ``` + +5. Create a release in Replicated: + ```bash + # This uploads the packaged charts and creates a new release + task create:release + ``` + +6. Verify the release was created successfully in the Replicated vendor portal + +### Testing Replicated Releases + +This workflow tests the full Replicated release and distribution process: + +1. 
After publishing a release, login to the registry with a license ID: + ```bash + # Set license ID for registry authentication + export REPLICATED_LICENSE_ID=your_license_id + export REPLICATED_APP=app_slug + export REPLICATED_CHANNEL=channel_name + + # Login to the registry + task login:registry + ``` + +2. Test the Helm installation from the Replicated registry: + ```bash + # This pulls charts from the Replicated registry with SDK enabled + task test:install:helm + ``` + + > **Note:** This creates Helm releases named `infra` and `mlflow` in the `mlflow` namespace. + +3. Verify the installation with application tests: + ```bash + task run:tests:app + ``` + +You can also run the complete test suite after setting up environment variables: +```bash +task run:tests:all +``` + +This workflow validates the entire release pipeline from publishing to installation, ensuring that your charts work correctly when distributed through the Replicated platform. + +## CI/CD Pipeline + +This application includes a CI/CD pipeline implemented with GitHub Actions. The pipeline handles: + +- Linting and validating Helm chart templates +- Creating releases in Replicated +- Testing Helm installation with charts from the Replicated registry +- Installing the application via KOTS + +The pipeline workflow: +1. `lint-and-template`: Validates chart syntax and templates (SDK disabled) +2. `create-release`: Packages charts and creates a release in Replicated +3. `helm-install-test`: Tests Helm installation with charts from Replicated registry (SDK enabled) +4. `kots-install-test`: Tests KOTS installation +5. `cleanup-test-release`: Cleans up test resources + +The pipeline is triggered on: +- Pull requests affecting the MLflow application +- Pushes to the main branch + +For more details, see the workflow definition in [.github/workflows/mlflow-ci.yml](../../.github/workflows/mlflow-ci.yml). \ No newline at end of file diff --git a/applications/mlflow/README.md b/applications/mlflow/README.md index 4863b9b7..d14e8b78 100644 --- a/applications/mlflow/README.md +++ b/applications/mlflow/README.md @@ -1,170 +1,130 @@ -# MLflow +# MLflow on Replicated -MLflow is an open-source platform for managing the machine learning lifecycle, including experimentation, reproducibility, deployment, and a central model registry. This application provides a Helm-based deployment of MLflow with support for Replicated distribution. +MLflow is an open-source platform for managing the machine learning lifecycle, including experimentation, reproducibility, deployment, and a central model registry. This solution provides MLflow deployment with Replicated, supporting multiple installation methods to fit your environment needs. -## Development +## Overview -The MLflow application includes a Taskfile.yml that provides tasks for developing, testing, and publishing the application. 
+This Replicated application offers MLflow with: -### Prerequisites +- Comprehensive tracking and model registry capabilities +- PostgreSQL backend for metadata storage +- MinIO for artifact storage +- Multiple deployment options for flexibility + +## Deployment Options + +### Helm Chart + +For customers who prefer direct Helm installation: +- Standard Helm chart interface +- Integration with existing CI/CD pipelines +- Full configurability via values + +```bash +# Add Replicated registry with your license ID +helm registry login registry.replicated.com --username= + +# Install via Helm +helm install mlflow oci://registry.replicated.com/mlflow/stable +``` + +### Embedded Cluster + +For customers without an existing Kubernetes cluster, the embedded option provides: +- Integrated Kubernetes cluster managed by Replicated +- Simple installation on VMs or bare metal +- No Kubernetes expertise required +- Optimized resource usage + +```bash +# Download installer from the provided license URL +# Run the installer script +bash ./install.sh +``` + +### KOTS Existing Cluster + +For customers with existing Kubernetes clusters, the KOTS installation method provides: +- Admin console for application management +- Version updates with rollback capability +- Configuration validation +- Pre-flight checks to verify environment requirements -- [Task](https://taskfile.dev/#/installation) command line tool -- Kubernetes cluster configured in your current context -- kubectl, helm, and python3 installed +```bash +# Install KOTS CLI +curl https://kots.io/install | bash + +# Install MLflow with KOTS +kubectl kots install mlflow/stable +``` + +## Documentation -### Development Workflow +- [MLflow Helm Chart Documentation](./charts/mlflow/README.md) - Installation and configuration details +- [Configuration Reference](./charts/mlflow/README_CONFIG.md) - Detailed configuration options -Follow this workflow for development: +## For Developers -1. Add required Helm repositories and update dependencies: - ```bash - task add:repos:helm - task update:deps:helm - ``` +If you're looking to contribute to or customize this application, please refer to our comprehensive [Development Guide](./DEVELOPMENT.md). The development guide covers: -2. Lint charts to check for issues: - ```bash - task lint - ``` +- Development workflow with Taskfile +- Local testing instructions +- Release process +- CI/CD integration +- Helm chart customization -3. Template charts to verify the rendered manifests: - ```bash - task template - ``` +We use [helm-docs](https://github.com/norwoodj/helm-docs) for chart documentation. See the [Development Guide](./DEVELOPMENT.md) for details. -4. Install charts for development: - ```bash - # Installs with Replicated SDK disabled - task install:helm:local - - # Optionally specify a custom values file - MLFLOW_VALUES=./my-values.yaml task install:helm:local - ``` +## MLflow Features - > **Note:** For local development, the Replicated SDK is explicitly disabled (`replicated.enabled=false`). This allows development without requiring access to the Replicated platform. - > - > This task automatically sets up port forwarding from localhost:5000 to the MLflow service in the cluster, making the application available for testing. +This Replicated distribution includes the following MLflow features: -5. 
Run application tests: - ```bash - task run:tests:app - ``` +- **Experiment Tracking**: Record parameters, metrics, code versions, and artifacts +- **Model Registry**: Store, annotate, and manage model versions in a central repository +- **Model Serving**: Deploy models for inference with version control +- **Project Management**: Package data science code for reproducibility -6. Make changes to your charts and repeat steps 2-5 as needed +## Architecture -This workflow allows rapid iteration without needing to publish to the Replicated registry. +The solution architecture consists of: + +- **MLflow Server**: Core MLflow tracking and registry services +- **PostgreSQL**: Metadata storage for experiments, runs, and models +- **MinIO**: S3-compatible storage for artifacts and model files +- **Replicated Integration**: Management layer for installation and updates + +## Getting Started + +### Prerequisites -### Task Reference +- For KOTS: Kubernetes cluster v1.19+ or admin access to install embedded cluster +- For Helm: Helm v3.0+ and a Kubernetes cluster +- Valid Replicated license -Tasks follow a `verb:resource[:subresource]` naming convention for clarity: +### Quick Start for Development + +For local development with the Helm charts: ```bash -# Validation and verification -task lint # Lint Helm charts -task template # Render templates to stdout (SDK disabled) -task check:versions # Verify Chart.yaml and KOTS manifest versions match - -# Repository and dependency management -task add:repos:helm # Add required Helm repositories -task update:deps:helm # Update Helm chart dependencies - -# Packaging and versioning -task update:versions:chart # Update chart version refs in KOTS manifests -task package:charts # Package Helm charts for distribution -task extract:version:chart # Extract current MLflow chart version - -# Installation -task install:helm:local # Install charts for local development (SDK disabled) - -# Testing -task test:install:helm # Test with charts from Replicated registry -task test:install:kots # Test KOTS installation -task run:tests:app # Run application tests against running MLflow -task run:tests:all # Run all tests (Helm install + app tests) - -# Release management -task create:release # Create a Replicated release - -# Cleanup -task clean:files:charts # Clean packaged chart files -task clean:all # Clean all generated files +# Clone this repository +git clone https://github.com/replicatedhq/platform-examples.git +cd platform-examples/applications/mlflow + +# Install Task CLI (if not already installed) +# See https://taskfile.dev/#/installation + +# Add required Helm repositories and update dependencies +task add:repos:helm +task update:deps:helm + +# Install charts locally with Replicated SDK disabled +task install:helm:local + +# Access MLflow UI at http://localhost:5000 ``` -### Publishing Replicated Releases - -When you're ready to publish your changes to the Replicated platform: - -1. Set up the required environment variables: - ```bash - # Replicated API token for authentication - export REPLICATED_API_TOKEN=your_api_token - - # App and channel to publish to - export REPLICATED_APP=app_slug - export REPLICATED_CHANNEL=channel_name - ``` - -2. Package the charts and update version references: - ```bash - # This updates KOTS manifests with the current chart versions - # and packages the charts as .tgz files - task package:charts - ``` - -3. 
Create a release in Replicated: - ```bash - # This uploads the packaged charts and creates a new release - task create:release - ``` - -4. Verify the release was created successfully in the Replicated vendor portal - -### Testing Replicated Releases - -This workflow tests the full Replicated release and distribution process: - -1. After publishing a release, login to the registry with a license ID: - ```bash - # Set license ID for registry authentication - export REPLICATED_LICENSE_ID=your_license_id - export REPLICATED_APP=app_slug - export REPLICATED_CHANNEL=channel_name - - # Login to the registry - task login:registry - ``` - -2. Test the Helm installation from the Replicated registry: - ```bash - # This pulls charts from the Replicated registry with SDK enabled - task test:install:helm - ``` - -3. Verify the installation with application tests: - ```bash - task run:tests:app - ``` - -This workflow validates the entire release pipeline from publishing to installation, ensuring that your charts work correctly when distributed through the Replicated platform. - -## CI/CD Pipeline - -This application includes a CI/CD pipeline implemented with GitHub Actions. The pipeline handles: - -- Linting and validating Helm chart templates -- Creating releases in Replicated -- Testing Helm installation with charts from the Replicated registry -- Installing the application via KOTS - -The pipeline workflow: -1. `lint-and-template`: Validates chart syntax and templates (SDK disabled) -2. `create-release`: Packages charts and creates a release in Replicated -3. `helm-install-test`: Tests Helm installation with charts from Replicated registry (SDK enabled) -4. `kots-install-test`: Tests KOTS installation -5. `cleanup-test-release`: Cleans up test resources - -The pipeline is triggered on: -- Pull requests affecting the MLflow application -- Pushes to the main branch - -For more details, see the workflow definition in [.github/workflows/mlflow-ci.yml](../../.github/workflows/mlflow-ci.yml). +For more details on using the Taskfile for development and releasing, see the [Development Guide](./DEVELOPMENT.md). + +## Support + +For support with this application, please visit the [Replicated Community](https://community.replicated.com/). diff --git a/applications/mlflow/charts/mlflow/README.md b/applications/mlflow/charts/mlflow/README.md index d6784a01..3e483822 100644 --- a/applications/mlflow/charts/mlflow/README.md +++ b/applications/mlflow/charts/mlflow/README.md @@ -1 +1,442 @@ # mlflow + +A Helm chart for MLflow - Open source platform for the machine learning lifecycle. + +![Version: 0.4.0](https://img.shields.io/badge/Version-0.4.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 2.10.0](https://img.shields.io/badge/AppVersion-2.10.0-informational?style=flat-square) + +## Introduction + +MLflow is an open-source platform for managing the machine learning lifecycle, including: +- Experiment tracking: Record and compare parameters, data, code, and results +- Model registry: Store, annotate, discover, and manage models in a central repository +- Model serving: Deploy models in diverse serving environments + +This Helm chart deploys MLflow with a PostgreSQL database for tracking and MinIO for artifact storage. 
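+
+As a quick smoke test after installation, you can point an MLflow client at the deployed tracking server and log a run. The snippet below is a minimal sketch, not part of the chart itself: it assumes the port-forward shown later in this document (`localhost:5000`) and the chart's default basic-auth credentials (`admin` / `password`); adjust both for your environment.
+
+```python
+import os
+import mlflow
+
+# Assumes `kubectl port-forward -n mlflow svc/mlflow 5000:5000` is running and the
+# chart's default basic-auth credentials have not been changed.
+os.environ["MLFLOW_TRACKING_USERNAME"] = "admin"
+os.environ["MLFLOW_TRACKING_PASSWORD"] = "password"
+mlflow.set_tracking_uri("http://localhost:5000")
+
+# Log a throwaway run to confirm the tracking server and backend store are working.
+mlflow.set_experiment("smoke-test")
+with mlflow.start_run():
+    mlflow.log_param("example_param", 1)
+    mlflow.log_metric("example_metric", 0.99)
+
+print("Run logged; check the MLflow UI at http://localhost:5000")
+```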
+ +## Source Code + +* + +## Requirements + +## Dependencies + +| Repository | Name | Version | +|------------|------|---------| +| oci://registry.replicated.com/library | replicated | ^1.1.0 | + +## Installing the Chart + +### Prerequisites +- Kubernetes cluster running version 1.19+ +- Helm 3.0+ +- Persistent storage provisioner (for PostgreSQL and MinIO) + +### Quick Start + +```bash +# Add the Replicated registry (if using Replicated) +helm registry login registry.replicated.com --username= + +# Install the chart +helm install mlflow oci://registry.replicated.com/your-app/your-channel/mlflow +``` + +### From Local Chart + +```bash +# Clone the repository +git clone https://github.com/replicatedhq/platform-examples.git +cd platform-examples/applications/mlflow + +# Install dependencies +helm dependency update ./charts/mlflow + +# Install the chart +helm install mlflow ./charts/mlflow --namespace mlflow --create-namespace +``` + +## Usage + +### Accessing MLflow + +After deploying MLflow, you can access the web UI by port-forwarding the service: + +```bash +kubectl port-forward -n mlflow svc/mlflow 5000:5000 +``` + +Then navigate to http://localhost:5000 in your browser. + +## Features + +- **Tracking Server**: Central interface for logging parameters, metrics, and artifacts +- **Model Registry**: Repository for managing the full lifecycle of MLflow Models +- **PostgreSQL**: Persistent storage for experiment and run data +- **MinIO**: S3-compatible storage for model artifacts +- **Replicated Integration**: Support for distribution through the Replicated platform + +## Configuration + +The following table lists the configurable parameters for the MLflow chart and their default values. + +For detailed configuration options, see the [Configuration Reference](./README_CONFIG.md). + +### Basic Configuration + +#### Minimum Configuration + +```yaml +# Minimal configuration example +postgresql: + auth: + password: "securePassword" # Required for security +minio: + auth: + rootPassword: "securePassword" # Required for security +``` + +#### Common Configuration Options + +```yaml +# Common options +mlflow: + # Set resources for MLflow server + resources: + requests: + memory: "512Mi" + cpu: "250m" + limits: + memory: "1Gi" + cpu: "500m" + + # Configure basic authentication + auth: + enabled: true + username: admin + password: password +``` + +For complete configuration options including external services, security settings, and advanced features, see the [Configuration Reference](./README_CONFIG.md). + +## Uninstalling the Chart + +```bash +helm uninstall mlflow -n mlflow +``` + +## Changelog + +The changelog for this chart is maintained in [README_CHANGELOG.md](./README_CHANGELOG.md). + +## Support + +For support with this chart, please visit the [Replicated Community](https://community.replicated.com/). + +---------------------------------------------- +Autogenerated from chart metadata using [helm-docs v1.14.2](https://github.com/norwoodj/helm-docs/releases/v1.14.2)# Configuration + +This document outlines the configuration options for the MLflow Helm chart. + +The following table lists the configurable parameters for the MLflow chart and their default values. + +## Advanced Configuration + +### PostgreSQL Configuration + +The chart uses PostgreSQL for storing MLflow metadata. 
You can configure the database connection using: + +```yaml +postgresql: + enabled: true + auth: + username: mlflow + password: mlflowpassword + database: mlflow + primary: + persistence: + size: 10Gi +``` + +### MinIO Configuration + +MinIO is used for artifact storage. Configure it with: + +```yaml +minio: + enabled: true + auth: + rootUser: minioadmin + rootPassword: minioadmin + persistence: + size: 20Gi + defaultBuckets: "mlflow" +``` + +### Using External Storage + +To use external PostgreSQL: + +```yaml +postgresql: + enabled: false + +mlflow: + backendStore: + databaseUri: "postgresql://user:password@external-postgresql:5432/mlflow" +``` + +To use external S3-compatible storage: + +```yaml +minio: + enabled: false + +mlflow: + artifactRoot: + s3: + enabled: true + bucket: "mlflow" + endpoint: "s3.amazonaws.com" + accessKey: "your-access-key" + secretKey: "your-secret-key" + region: "us-east-1" +``` + +### Replicated SDK Integration + +Enable or disable the Replicated SDK integration: + +```yaml +replicated: + enabled: true +``` + +For development environments, you'll typically want to disable this: + +```yaml +replicated: + enabled: false +``` + +### Security Considerations + +By default, this chart doesn't include authentication. In production, consider: + +1. Using an ingress with authentication +2. Setting up TLS encryption +3. Configuring username/password protection + +Example ingress configuration with TLS: + +```yaml +ingress: + enabled: true + annotations: + kubernetes.io/ingress.class: nginx + cert-manager.io/cluster-issuer: letsencrypt-prod + hosts: + - host: mlflow.example.com + paths: + - path: / + pathType: Prefix + tls: + - secretName: mlflow-tls + hosts: + - mlflow.example.com +``` + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| fullnameOverride | string | `"mlflow"` | String to override the default generated fullname | +| minio.enabled | bool | `true` | | +| minio.secrets.accessKey | string | `"minio"` | | +| minio.secrets.name | string | `"myminio-env-configuration"` | | +| minio.secrets.secretKey | string | `"minio1234"` | | +| minio.tenant.additionalVolumeMounts | list | `[]` | An array of volume mount points associated to each Tenant container. | +| minio.tenant.additionalVolumes | list | `[]` | An array of `Volumes `__ which the Operator can mount to Tenant pods. The volumes must exist *and* be accessible to the Tenant pods. | +| minio.tenant.buckets | list | `[{"name":"mlflow"}]` | Array of objects describing one or more buckets to create during tenant provisioning. | +| minio.tenant.certificate | object | `{"certConfig":{},"externalCaCertSecret":[],"externalCertSecret":[],"requestAutoCert":true}` | Configures external certificate settings for the Tenant. | +| minio.tenant.certificate.certConfig | object | `{}` | See `Operator CRD: CertificateConfig `__ | +| minio.tenant.certificate.externalCaCertSecret | list | `[]` | Specify an array of Kubernetes TLS secrets, where each entry corresponds to a secret the TLS private key and public certificate pair. See `Operator CRD: TenantSpec `__. | +| minio.tenant.certificate.externalCertSecret | list | `[]` | Specify an array of Kubernetes secrets, where each entry corresponds to a secret contains the TLS private key and public certificate pair. See `Operator CRD: TenantSpec `__. | +| minio.tenant.configuration | object | `{"name":"myminio-env-configuration"}` | The Kubernetes secret name that contains MinIO environment variable configurations. 
The secret is expected to have a key named config.env containing environment variables exports. | +| minio.tenant.env | list | `[]` | Add environment variables to be set in MinIO container (https://github.com/minio/minio/tree/master/docs/config) | +| minio.tenant.exposeServices | object | `{}` | Directs the Operator to deploy the MinIO S3 API and Console services as LoadBalancer objects. If the Kubernetes cluster has a configured LoadBalancer, it can attempt to route traffic to those services automatically. Specify ``minio: true`` to expose the MinIO S3 API. Specify ``console: true`` to expose the Console. Both fields default to ``false``. | +| minio.tenant.features | object | `{"bucketDNS":false,"domains":{},"enableSFTP":false}` | MinIO features to enable or disable in the MinIO Tenant See `Operator CRD: Features `__. | +| minio.tenant.image.pullPolicy | string | `"IfNotPresent"` | Image pull policy | +| minio.tenant.image.repository | string | `"quay.io/minio/minio"` | Image repository | +| minio.tenant.image.tag | string | `"RELEASE.2024-05-01T01-11-10Z"` | Image tag | +| minio.tenant.imagePullSecret | object | `{}` | Image pull secrets | +| minio.tenant.lifecycle | object | `{}` | The `Lifecycle hooks `__ for container. | +| minio.tenant.liveness | object | `{}` | The `Liveness Probe `__ for monitoring Tenant pod liveness. Tenant pods will be restarted if the probe fails. | +| minio.tenant.logging | object | `{}` | Configure pod logging configuration for the MinIO Tenant. Specify ``json`` for JSON-formatted logs. Specify ``anonymous`` for anonymized logs. Specify ``quiet`` to supress logging. | +| minio.tenant.metrics | object | `{"enabled":false,"port":9000,"protocol":"http"}` | Configures a Prometheus-compatible scraping endpoint at the specified port. | +| minio.tenant.mountPath | string | `"/export"` | The mount path where Persistent Volumes are mounted inside Tenant container(s). | +| minio.tenant.name | string | `"minio"` | Minio Tenant name | +| minio.tenant.podManagementPolicy | string | `"Parallel"` | The `PodManagement `__ policy for MinIO Tenant Pods. Can be "OrderedReady" or "Parallel" | +| minio.tenant.pools | object | `{"pool0":{"affinity":{},"annotations":{},"containerSecurityContext":{"allowPrivilegeEscalation":false,"capabilities":{"drop":["ALL"]},"runAsGroup":1000,"runAsNonRoot":true,"runAsUser":1000,"seccompProfile":{"type":"RuntimeDefault"}},"labels":{},"name":"pool-0","nodeSelector":{},"podAntiAffinityMode":"soft","podAntiAffinityTopologyKey":"","resources":{},"runtimeClassName":"","securityContext":{"fsGroup":1000,"fsGroupChangePolicy":"OnRootMismatch","runAsGroup":1000,"runAsNonRoot":true,"runAsUser":1000},"servers":3,"size":"10Gi","storageAnnotations":{},"tolerations":[],"topologySpreadConstraints":[],"volumesPerServer":4}}` | See `Operator CRD: Pools `__ for more information on all subfields. 
| +| minio.tenant.pools.pool0 | object | `{"affinity":{},"annotations":{},"containerSecurityContext":{"allowPrivilegeEscalation":false,"capabilities":{"drop":["ALL"]},"runAsGroup":1000,"runAsNonRoot":true,"runAsUser":1000,"seccompProfile":{"type":"RuntimeDefault"}},"labels":{},"name":"pool-0","nodeSelector":{},"podAntiAffinityMode":"soft","podAntiAffinityTopologyKey":"","resources":{},"runtimeClassName":"","securityContext":{"fsGroup":1000,"fsGroupChangePolicy":"OnRootMismatch","runAsGroup":1000,"runAsNonRoot":true,"runAsUser":1000},"servers":3,"size":"10Gi","storageAnnotations":{},"tolerations":[],"topologySpreadConstraints":[],"volumesPerServer":4}` | The number of MinIO Tenant Pods / Servers in this pool. For standalone mode, supply 1. For distributed mode, supply 4 or more. Note that the operator does not support upgrading from standalone to distributed mode. | +| minio.tenant.pools.pool0.affinity | object | `{}` | Affinity/Anti-affinity rules for Pods. See: https://cloudnative-pg.io/documentation/current/cloudnative-pg.v1/#postgresql-cnpg-io-v1-AffinityConfiguration | +| minio.tenant.pools.pool0.annotations | object | `{}` | Specify `annotations `__ to associate to Tenant pods. | +| minio.tenant.pools.pool0.containerSecurityContext | object | `{"allowPrivilegeEscalation":false,"capabilities":{"drop":["ALL"]},"runAsGroup":1000,"runAsNonRoot":true,"runAsUser":1000,"seccompProfile":{"type":"RuntimeDefault"}}` | The Kubernetes `SecurityContext `__ to use for deploying Tenant containers. | +| minio.tenant.pools.pool0.labels | object | `{}` | Specify `labels `__ to associate to Tenant pods. | +| minio.tenant.pools.pool0.name | string | `"pool-0"` | Custom name for the pool | +| minio.tenant.pools.pool0.nodeSelector | object | `{}` | Any `Node Selectors `__ to apply to Tenant pods. | +| minio.tenant.pools.pool0.podAntiAffinityMode | string | `"soft"` | Specifies whether podAntiAffinity should be "required" or simply "preferred" This determines if requiredDuringSchedulingIgnoredDuringExecution or preferredDuringSchedulingIgnoredDuringExecution is used [[ref]](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity) | +| minio.tenant.pools.pool0.podAntiAffinityTopologyKey | string | `""` | Enables podAntiAffinity with the specified topology key .minio.tenant.pool.pool0.affinity takes precedence over this setting | +| minio.tenant.pools.pool0.resources | object | `{}` | The `Requests or Limits `__ for resources to associate to Tenant pods. | +| minio.tenant.pools.pool0.runtimeClassName | string | `""` | The name of a custom `Container Runtime `__ to use for the Operator Console pods. | +| minio.tenant.pools.pool0.securityContext | object | `{"fsGroup":1000,"fsGroupChangePolicy":"OnRootMismatch","runAsGroup":1000,"runAsNonRoot":true,"runAsUser":1000}` | The Kubernetes `SecurityContext `__ to use for deploying Tenant resources. | +| minio.tenant.pools.pool0.size | string | `"10Gi"` | The capacity per volume requested per MinIO Tenant Pod. | +| minio.tenant.pools.pool0.storageAnnotations | object | `{}` | Specify `storageAnnotations `__ to associate to PVCs. | +| minio.tenant.pools.pool0.tolerations | list | `[]` | An array of `Toleration labels `__ to associate to Tenant pods. | +| minio.tenant.pools.pool0.topologySpreadConstraints | list | `[]` | An array of `Topology Spread Constraints `__ to associate to Operator Console pods. | +| minio.tenant.pools.pool0.volumesPerServer | int | `4` | The number of volumes attached per MinIO Tenant Pod / Server. 
| +| minio.tenant.priorityClassName | string | `""` | PriorityClassName indicates the Pod priority and hence importance of a Pod relative to other Pods. This is applied to MinIO pods only. Refer Kubernetes documentation for details https://kubernetes.io/docs/concepts/configuration/pod-priority-preemption/#priorityclass/ | +| minio.tenant.prometheusOperator | bool | `false` | Directs the Operator to add the Tenant's metric scrape configuration to an existing Kubernetes Prometheus deployment managed by the Prometheus Operator. | +| minio.tenant.scheduler | object | `{}` | | +| minio.tenant.serviceAccountName | string | `""` | The `Kubernetes Service Account `__ associated with the Tenant. | +| minio.tenant.serviceMetadata | object | `{}` | serviceMetadata allows passing additional labels and annotations to MinIO and Console specific services created by the operator. | +| minio.tenant.startup | object | `{}` | `Startup Probe `__ for monitoring container startup. Tenant pods will be restarted if the probe fails. | +| minio.tenant.subPath | string | `"/data"` | The Sub path inside Mount path where MinIO stores data. | +| minio.tenant.users | list | `[]` | Array of Kubernetes secrets from which the Operator generates MinIO users during tenant provisioning. Each secret should specify the ``CONSOLE_ACCESS_KEY`` and ``CONSOLE_SECRET_KEY`` as the access key and secret key for that user. | +| mlflow.affinity | object | `{}` | Defines affinity constraint rules. [[ref]](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity) | +| mlflow.annotations | object | `{}` | Annotations to add to the mlflow deployment | +| mlflow.artifactStore.s3.accessKeyId | string | `"minio"` | AWS access key ID Used if S3 is enabled as an artifact storage backend and no existing secret is specified | +| mlflow.artifactStore.s3.createCaSecret | object | `{"caBundle":""}` | If S3 is enabled as artifact store backend and no existing CA secret is specified, create the secret used to secure connection to S3 / Minio | +| mlflow.artifactStore.s3.createCaSecret.caBundle | string | `""` | Content of CA bundle | +| mlflow.artifactStore.s3.enabled | bool | `true` | Specifies whether to enable AWS S3 as artifact store backend NOTE: Need to also ensure .mlflow.trackingServer.artifactsDestination is set to the correct S3 bucket | +| mlflow.artifactStore.s3.existingCaSecret | string | `""` | Name of an existing secret containing the key `ca-bundle.crt` used to store the CA certificate for TLS connections | +| mlflow.artifactStore.s3.existingSecret | string | `""` | Name of an existing secret containing the keys `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` to access artifact storage on AWS S3 or MINIO | +| mlflow.artifactStore.s3.external | object | `{"enabled":false,"host":"","ignoreTls":false,"port":443,"protocol":"https"}` | External S3 compatible bucket details Used if S3 is enabled as artifact storage backend, external.enabled is true, and no existing secret is specified | +| mlflow.artifactStore.s3.external.enabled | bool | `false` | Specifies whether to use an external S3 bucket | +| mlflow.artifactStore.s3.external.host | string | `""` | S3 endpoint host | +| mlflow.artifactStore.s3.external.ignoreTls | bool | `false` | Specify whether to ignore TLS | +| mlflow.artifactStore.s3.external.port | int | `443` | S3 endpoint port | +| mlflow.artifactStore.s3.external.protocol | string | `"https"` | S3 endpoint protocol https or http | +| mlflow.artifactStore.s3.ignoreTls | bool | `true` | 
Specify whether to ignore TLS | +| mlflow.artifactStore.s3.secretAccessKey | string | `"minio1234"` | AWS secret access key Used if S3 is enabled as an artifact storage backend and no existing secret is specified | +| mlflow.automountServiceAccountToken | bool | `true` | Specifies whether to automount service account token | +| mlflow.backendStore.databaseUpgrade | bool | `false` | Specifies whether to run `mlflow db upgrade ${MLFLOW_BACKEND_STORE_URI}` to upgrade database schema when use a database as backend store | +| mlflow.backendStore.existingSecret | string | `""` | Name of an existing secret which contains key `MLFLOW_BACKEND_STORE_URI` If an existing secret is not provided, a new secret will be created to store the backend store URI using the details from .Values.postgres when Embedded PostgreSQL is enabled | +| mlflow.containerSecurityContext | object | `{}` | Configure the Security Context for the Container | +| mlflow.dnsConfig | object | `{}` | Optional DNS settings, configuring the ndots option may resolve nslookup issues on some Kubernetes setups. | +| mlflow.dnsPolicy | string | `""` | Defaults to "ClusterFirst" if hostNetwork is false and "ClusterFirstWithHostNet" if hostNetwork is true | +| mlflow.enableServiceLinks | bool | `true` | Enable/disable the generation of environment variables for services. [[ref]](https://kubernetes.io/docs/concepts/services-networking/connect-applications-service/#accessing-the-service) | +| mlflow.env | object | `{"configMap":{},"container":[],"secret":{}}` | Extra environment variables in mlflow container | +| mlflow.extraContainers | list | `[]` | Extra containers belonging to the mlflow pod. | +| mlflow.extraEnvFrom | list | `[]` | Extra environment variable sources in mlflow container | +| mlflow.extraInitContainers | list | `[]` | Extra initialization containers belonging to the mlflow pod. | +| mlflow.extraVolumeMounts | list | `[]` | Extra volume mounts to mount into the mlflow container's file system | +| mlflow.extraVolumes | list | `[]` | Extra volumes that can be mounted by containers belonging to the mlflow pod | +| mlflow.hostAliases | list | `[]` | Use hostAliases to add custom entries to /etc/hosts - mapping IP addresses to hostnames. 
[[ref]](https://kubernetes.io/docs/concepts/services-networking/add-entries-to-pod-etc-hosts-with-host-aliases/) | +| mlflow.hostNetwork | bool | `false` | When using hostNetwork make sure you set dnsPolicy to `ClusterFirstWithHostNet` | +| mlflow.hostname | string | `""` | Allows specifying explicit hostname setting | +| mlflow.image | object | `{"pullPolicy":"IfNotPresent","registry":"docker.io","repository":"bitnami/mlflow","tag":"2.12.2-debian-12-r1"}` | Image configuration for the mlflow deployment | +| mlflow.image.pullPolicy | string | `"IfNotPresent"` | Image pull policy | +| mlflow.image.registry | string | `"docker.io"` | Image registry | +| mlflow.image.repository | string | `"bitnami/mlflow"` | Image repository | +| mlflow.image.tag | string | `"2.12.2-debian-12-r1"` | Image tag | +| mlflow.imagePullSecets | list | `[]` | Image pull secrets | +| mlflow.ingress | object | `{"annotations":{},"className":"nginx","enabled":false,"extraHosts":[],"extraPaths":[],"extraRules":[],"extraTls":[],"hostname":"chart-example.local","path":"/","pathType":"ImplementationSpecific","tls":{"cert":"-----BEGIN CERTIFICATE-----\n-----END CERTIFICATE-----\n","enabled":false,"genSelfSignedCert":false,"key":"-----BEGIN PRIVATE KEY-----\n-----END PRIVATE KEY-----\n"}}` | Mlflow Ingress configuration [[ref]](https://kubernetes.io/docs/concepts/services-networking/ingress/) | +| mlflow.ingress.annotations | object | `{}` | Annotations to add to the ingress | +| mlflow.ingress.className | string | `"nginx"` | Ingress class name | +| mlflow.ingress.enabled | bool | `false` | Specifies whether a ingress should be created | +| mlflow.ingress.extraHosts | list | `[]` | Extra hosts to configure for the ingress object | +| mlflow.ingress.extraPaths | list | `[]` | Extra paths to configure for the ingress object | +| mlflow.ingress.extraRules | list | `[]` | Extra rules to configure for the ingress object | +| mlflow.ingress.extraTls | list | `[]` | Extra tls hosts to configure for the ingress object | +| mlflow.ingress.hostname | string | `"chart-example.local"` | Ingress hostname | +| mlflow.ingress.path | string | `"/"` | Ingress path | +| mlflow.ingress.pathType | string | `"ImplementationSpecific"` | Ingress path type | +| mlflow.ingress.tls | object | `{"cert":"-----BEGIN CERTIFICATE-----\n-----END CERTIFICATE-----\n","enabled":false,"genSelfSignedCert":false,"key":"-----BEGIN PRIVATE KEY-----\n-----END PRIVATE KEY-----\n"}` | Ingress TLS configuration | +| mlflow.ingress.tls.enabled | bool | `false` | Specifies whether to enable TLS | +| mlflow.ingress.tls.genSelfSignedCert | bool | `false` | Specifies whether to generate self-signed certificate | +| mlflow.labels | object | `{}` | Labels to add to the mlflow deployment | +| mlflow.lifecycle | object | `{}` | Configure the lifecycle for the container | +| mlflow.nodeSelector | object | `{}` | Node selection constraint [[ref]](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#nodeselector) | +| mlflow.podAntiAffinityMode | string | `"soft"` | Specifies whether podAntiAffinity should be "required" or simply "preferred" This determines if requiredDuringSchedulingIgnoredDuringExecution or preferredDuringSchedulingIgnoredDuringExecution is used [[ref]](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity) | +| mlflow.podAntiAffinityTopologyKey | string | `""` | Enables podAntiAffinity with the specified topology key .mlflow.affinity takes precedence over this setting | +| mlflow.podLabels | 
object | `{}` | Pod Labels for the mlflow deployment | +| mlflow.podSecurityContext | object | `{}` | Configure the Security Context for the Pod | +| mlflow.priorityClassName | string | `""` | Custom priority class for different treatment by the scheduler | +| mlflow.probes | object | `{"livenessProbe":{},"readinessProbe":{},"startupProbe":{}}` | Specify probes for the container [[ref]](https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/) | +| mlflow.probes.livenessProbe | object | `{}` | Specify the liveness probes for the container | +| mlflow.probes.readinessProbe | object | `{}` | Specify the readiness probes for the container | +| mlflow.probes.startupProbe | object | `{}` | Specify the startup probes for the container | +| mlflow.replicas | int | `1` | Number of mlflow server replicas to deploy | +| mlflow.resources | object | `{}` | Set the resource requests / limits for the container. | +| mlflow.revisionHistoryLimit | string | `nil` | Deployment revision history limit | +| mlflow.rollingUpdate | object | `{"maxSurge":1,"maxUnavailable":1}` | Rolling update configuration | +| mlflow.rollingUpdate.maxSurge | int | `1` | The maximum number of pods that can be scheduled above the desired number of pods | +| mlflow.rollingUpdate.maxUnavailable | int | `1` | The maximum number of pods that can be unavailable during the update process | +| mlflow.runtimeClassName | string | `""` | Allow specifying a runtimeClassName other than the default one (ie: nvidia) | +| mlflow.schedulerName | string | `""` | Allows specifying a custom scheduler name | +| mlflow.service | object | `{"annotations":{},"name":"http","nodePort":"","port":5000,"type":"ClusterIP"}` | Mlflow Service configuration | +| mlflow.service.annotations | object | `{}` | Annotations to add to the service | +| mlflow.service.name | string | `"http"` | Service port name | +| mlflow.service.nodePort | string | `""` | Service Node port Used when the service type is NodePort or LoadBalancer | +| mlflow.service.port | int | `5000` | Service port number | +| mlflow.service.type | string | `"ClusterIP"` | Specifies which type of service should be created | +| mlflow.serviceAccount | object | `{"annotations":{},"create":true,"name":""}` | Service account configuration for the mlflow deployment | +| mlflow.serviceAccount.annotations | object | `{}` | Annotations to add to the service account if create is true | +| mlflow.serviceAccount.create | bool | `true` | Specifies whether a service account should be created | +| mlflow.serviceAccount.name | string | `""` | Name of the service account to use. If not set and create is true, a name is generated using the fullname template | +| mlflow.strategy | string | `"RollingUpdate"` | Strategy to use to replace existing pods with new ones | +| mlflow.termination.gracePeriodSeconds | string | `nil` | [[ref](https://kubernetes.io/docs/reference/kubernetes-api/workload-resources/pod-v1/#lifecycle)] | +| mlflow.termination.messagePath | string | `nil` | [[ref](https://kubernetes.io/docs/reference/kubernetes-api/workload-resources/pod-v1/#lifecycle-1)] | +| mlflow.termination.messagePolicy | string | `nil` | [[ref](https://kubernetes.io/docs/reference/kubernetes-api/workload-resources/pod-v1/#lifecycle-1)] | +| mlflow.tolerations | list | `[]` | Specify taint tolerations [[ref]](https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/) | +| mlflow.topologySpreadConstraints | list | `[]` | Defines topologySpreadConstraint rules. 
[[ref]](https://kubernetes.io/docs/concepts/workloads/pods/pod-topology-spread-constraints/) | +| mlflow.trackingServer.artifactsDestination | string | `"s3://mlflow"` | Specifies the base artifact location from which to resolve artifact upload/download/list requests (e.g. `s3://my-bucket`) | +| mlflow.trackingServer.basicAuth.createSecret.adminPassword | string | `"password"` | Default admin password if the admin is not already created | +| mlflow.trackingServer.basicAuth.createSecret.adminUsername | string | `"admin"` | Default admin username if the admin is not already created | +| mlflow.trackingServer.basicAuth.createSecret.authorizationFunction | string | `"mlflow.server.auth:authenticate_request_basic_auth"` | Function to authenticate requests | +| mlflow.trackingServer.basicAuth.createSecret.defaultPermission | string | `"READ"` | Default permission on all resources | +| mlflow.trackingServer.basicAuth.enabled | bool | `true` | Specifies whether to enable basic authentication | +| mlflow.trackingServer.basicAuth.existingSecret | string | `""` | Name of an existing secret which contains key `basic_auth.ini` | +| mlflow.trackingServer.defaultArtifactRoot | string | `""` | Specifies a default artifact location for logging, data will be logged to `mlflow-artifacts/:` if artifact serving is enabled, otherwise `./mlruns` | +| mlflow.trackingServer.extraArgs | list | `["--dev"]` | Extra arguments passed to the `mlflow server` command | +| mlflow.trackingServer.host | string | `"0.0.0.0"` | Network address to listen on | +| mlflow.trackingServer.mode | string | `"serve-artifacts"` | Specifies which mode mlflow tracking server run with, available options are `serve-artifacts`, `no-serve-artifacts` and `artifacts-only` | +| mlflow.trackingServer.port | int | `5000` | Port to expose the tracking server | +| mlflow.trackingServer.workers | int | `1` | Number of gunicorn worker processes to handle requests | +| nameOverride | string | `""` | String to override the default generated name | +| postgres | object | `{"auth":{"password":"mlflow","username":"mlflow"},"embedded":{"additionalLabels":{},"affinity":{},"annotations":{},"certificates":{},"enableSuperuserAccess":true,"enabled":true,"image":{"repository":"ghcr.io/cloudnative-pg/postgresql","tag":"15.2"},"imagePullPolicy":"IfNotPresent","imagePullSecrets":[],"initdb":{"database":"mlflow","owner":"mlflow","postInitApplicationSQL":[]},"instances":3,"logLevel":"info","podAntiAffinityMode":"soft","podAntiAffinityTopologyKey":"","postgresGID":26,"postgresUID":26,"postgresql":{},"primaryUpdateMethod":"switchover","primaryUpdateStrategy":"unsupervised","priorityClassName":"","resources":{},"roles":[],"storage":{"size":"10Gi","storageClass":""},"superuserSecret":"","type":"postgresql"},"external":{"database":"mlflow","enabled":false,"host":"","port":5432}}` | Embedded Postrgres configuration Deploys a cluster using the CloudnativePG Operator [[ref]](https://github.com/cloudnative-pg/cloudnative-pg) | +| postgres.auth | object | `{"password":"mlflow","username":"mlflow"}` | Postgres authentication configuration | +| postgres.auth.password | string | `"mlflow"` | Mlflow Tracking Server Postgres password | +| postgres.auth.username | string | `"mlflow"` | Mlflow Tracking Server Postgres username | +| postgres.embedded.additionalLabels | object | `{}` | Addtional labels for Postgres cluster | +| postgres.embedded.affinity | object | `{}` | Affinity/Anti-affinity rules for Pods. 
See: https://cloudnative-pg.io/documentation/current/cloudnative-pg.v1/#postgresql-cnpg-io-v1-AffinityConfiguration | +| postgres.embedded.annotations | object | `{}` | Postgres cluster annotations | +| postgres.embedded.certificates | object | `{}` | The configuration for the CA and related certificates. See: https://cloudnative-pg.io/documentation/current/cloudnative-pg.v1/#postgresql-cnpg-io-v1-CertificatesConfiguration | +| postgres.embedded.enableSuperuserAccess | bool | `true` | When this option is enabled, the operator will use the SuperuserSecret to update the postgres user password. If the secret is not present, the operator will automatically create one. When this option is disabled, the operator will ignore the SuperuserSecret content, delete it when automatically created, and then blank the password of the postgres user by setting it to NULL. | +| postgres.embedded.enabled | bool | `true` | Specifies whether to enable the Embedded Postgres cluster | +| postgres.embedded.image.repository | string | `"ghcr.io/cloudnative-pg/postgresql"` | Image repository | +| postgres.embedded.image.tag | string | `"15.2"` | Image tag | +| postgres.embedded.imagePullPolicy | string | `"IfNotPresent"` | Image pull policy | +| postgres.embedded.imagePullSecrets | list | `[]` | Image pull secrets | +| postgres.embedded.initdb | object | `{"database":"mlflow","owner":"mlflow","postInitApplicationSQL":[]}` | Postgres InitDB configuration | +| postgres.embedded.initdb.database | string | `"mlflow"` | Postgres database name to be initialized | +| postgres.embedded.initdb.owner | string | `"mlflow"` | Postgres username to be initialized | +| postgres.embedded.initdb.postInitApplicationSQL | list | `[]` | Postgres init application SQL | +| postgres.embedded.instances | int | `3` | Number of Postgres instances to deploy | +| postgres.embedded.logLevel | string | `"info"` | Postgres log level | +| postgres.embedded.podAntiAffinityMode | string | `"soft"` | Specifies whether podAntiAffinity should be "required" or simply "preferred" This determines if requiredDuringSchedulingIgnoredDuringExecution or preferredDuringSchedulingIgnoredDuringExecution is used [[ref]](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity) | +| postgres.embedded.podAntiAffinityTopologyKey | string | `""` | Enables podAntiAffinity with the specified topology key .postgres.embedded.affinity takes precedence over this setting | +| postgres.embedded.postgresGID | int | `26` | Postgres GID | +| postgres.embedded.postgresUID | int | `26` | Postgres UID | +| postgres.embedded.postgresql | object | `{}` | Configuration of the PostgreSQL server. See: https://cloudnative-pg.io/documentation/current/cloudnative-pg.v1/#postgresql-cnpg-io-v1-PostgresConfiguration | +| postgres.embedded.primaryUpdateMethod | string | `"switchover"` | Postgres primary update method | +| postgres.embedded.primaryUpdateStrategy | string | `"unsupervised"` | Postgres primary update strategy | +| postgres.embedded.priorityClassName | string | `""` | Postgres priority class name | +| postgres.embedded.resources | object | `{}` | Postgres resources | +| postgres.embedded.roles | list | `[]` | This feature enables declarative management of existing roles, as well as the creation of new roles if they are not already present in the database. 
See: https://cloudnative-pg.io/documentation/current/declarative_role_management/ | +| postgres.embedded.storage | object | `{"size":"10Gi","storageClass":""}` | Postgres storage configuration | +| postgres.external.database | string | `"mlflow"` | External Postgres database | +| postgres.external.enabled | bool | `false` | Specifies whether to use an external PostgresSQL cluster NOTE: If you enabled External PostgreSQL, you should disable the Embedded PostgreSQL (cluster.enabled: false) | +| postgres.external.host | string | `""` | External Postgres host | +| postgres.external.port | int | `5432` | External Postgres port | +| replicated.enabled | bool | `true` | Specifies whetherto enable the Replicated SDK | + diff --git a/applications/mlflow/charts/mlflow/README.md.gotmpl b/applications/mlflow/charts/mlflow/README.md.gotmpl new file mode 100644 index 00000000..886cebcc --- /dev/null +++ b/applications/mlflow/charts/mlflow/README.md.gotmpl @@ -0,0 +1,204 @@ +{{- define "custom.repository.organization" -}} +replicatedhq +{{- end -}} + +{{- define "custom.repository.url" -}} +https://github.com/replicatedhq/platform-examples +{{- end -}} + +{{- define "custom.helm.url" -}} +https://github.com/replicatedhq/platform-examples/tree/main/applications/mlflow +{{- end -}} + +{{- define "custom.helm.path" -}} +{{ template "custom.repository.organization" . }}/{{ template "chart.name" . }} +{{- end -}} + +{{- define "custom.notes" -}} +{{- end -}} + +{{- define "custom.requirements" -}} +## Requirements + +{{ template "chart.kubeVersionLine" . }} +{{- end -}} + +{{- define "custom.dependencies" -}} +## Dependencies + +{{ template "chart.requirementsTable" . }} +{{- end -}} + +{{- define "custom.install" -}} +## Installing the Chart + +### Prerequisites +- Kubernetes cluster running version 1.19+ +- Helm 3.0+ +- Persistent storage provisioner (for PostgreSQL and MinIO) + +### Quick Start + +```bash +# Add the Replicated registry (if using Replicated) +helm registry login registry.replicated.com --username= + +# Install the chart +helm install mlflow oci://registry.replicated.com/your-app/your-channel/mlflow +``` + +### From Local Chart + +```bash +# Clone the repository +git clone https://github.com/replicatedhq/platform-examples.git +cd platform-examples/applications/mlflow + +# Install dependencies +helm dependency update ./charts/mlflow + +# Install the chart +helm install mlflow ./charts/mlflow --namespace mlflow --create-namespace +``` +{{- end -}} + +{{- define "custom.usage" -}} +## Usage + +### Accessing MLflow + +After deploying MLflow, you can access the web UI by port-forwarding the service: + +```bash +kubectl port-forward -n mlflow svc/mlflow 5000:5000 +``` + +Then navigate to http://localhost:5000 in your browser. +{{- end -}} + +{{- define "custom.features" -}} +## Features + +- **Tracking Server**: Central interface for logging parameters, metrics, and artifacts +- **Model Registry**: Repository for managing the full lifecycle of MLflow Models +- **PostgreSQL**: Persistent storage for experiment and run data +- **MinIO**: S3-compatible storage for model artifacts +- **Replicated Integration**: Support for distribution through the Replicated platform +{{- end -}} + +{{- define "custom.configuration" -}} +## Configuration + +{{ template "custom.config.introduction" . }} + +For detailed configuration options, see the [Configuration Reference](./README_CONFIG.md). 
+ +### Basic Configuration + +#### Minimum Configuration + +```yaml +# Minimal configuration example +postgresql: + auth: + password: "securePassword" # Required for security +minio: + auth: + rootPassword: "securePassword" # Required for security +``` + +#### Common Configuration Options + +```yaml +# Common options +mlflow: + # Set resources for MLflow server + resources: + requests: + memory: "512Mi" + cpu: "250m" + limits: + memory: "1Gi" + cpu: "500m" + + # Configure basic authentication + auth: + enabled: true + username: admin + password: password +``` + +For complete configuration options including external services, security settings, and advanced features, see the [Configuration Reference](./README_CONFIG.md). +{{- end -}} + +{{- define "custom.uninstall" -}} +## Uninstalling the Chart + +```bash +helm uninstall mlflow -n mlflow +``` +{{- end -}} + +{{- define "custom.support" -}} +## Support + +For support with this chart, please visit the [Replicated Community](https://community.replicated.com/). +{{- end -}} + +{{- define "custom.changelogLink" -}} +## Changelog + +The changelog for this chart is maintained in [README_CHANGELOG.md](./README_CHANGELOG.md). +{{- end -}} + +{{- define "chart.valuesSection" -}} +## Values + +The configuration values for this chart are documented in a separate file due to their size and complexity. +See [Configuration Reference](./README_CONFIG.md) for detailed descriptions of all available options. +{{- end -}} + +{{- define "chart.versionFooter" -}} +## Version + +Helm Chart version: {{ .Version }} +{{- end -}} + +{{ template "chart.header" . }} + +{{ template "chart.description" . }} + +{{ template "chart.versionBadge" . }}{{ template "chart.typeBadge" . }}{{ template "chart.appVersionBadge" . }} + +## Introduction + +MLflow is an open-source platform for managing the machine learning lifecycle, including: +- Experiment tracking: Record and compare parameters, data, code, and results +- Model registry: Store, annotate, discover, and manage models in a central repository +- Model serving: Deploy models in diverse serving environments + +This Helm chart deploys MLflow with a PostgreSQL database for tracking and MinIO for artifact storage. + +{{ template "custom.notes" . }} + +{{ template "chart.sourcesSection" . }} + +{{ template "custom.requirements" . }} + +{{ template "custom.dependencies" . }} + +{{ template "custom.install" . }} + +{{ template "custom.usage" . }} + +{{ template "custom.features" . }} + +{{ template "custom.configuration" . }} + +{{ template "custom.uninstall" . }} + +{{ template "custom.changelogLink" . }} + +{{ template "custom.support" . }} + +{{ template "helm-docs.versionFooter" . }} diff --git a/applications/mlflow/charts/mlflow/README_CHANGELOG.md.gotmpl b/applications/mlflow/charts/mlflow/README_CHANGELOG.md.gotmpl new file mode 100644 index 00000000..221ddd63 --- /dev/null +++ b/applications/mlflow/charts/mlflow/README_CHANGELOG.md.gotmpl @@ -0,0 +1,46 @@ +{{- define "custom.changelog.header" -}} +# Changelog + +This file documents all notable changes to the MLflow Helm chart. The release numbering follows [Semantic Versioning](https://semver.org/). +{{- end -}} + +{{- define "custom.changelog" -}} +{{ template "custom.changelog.header" . 
}} + +## [{{ .Version }}] + +### Added +- Added support for Replicated distribution +- Integrated with Replicated SDK +- Added example ML workflow for testing + +### Changed +- Updated MLflow to version {{ .AppVersion }} +- Improved PostgreSQL configuration options +- Enhanced MinIO integration + +## [0.3.0] + +### Added +- Support for artifact proxying through the tracking server +- Additional security options for production deployments + +### Fixed +- Connection issues with PostgreSQL backend +- Artifact storage path handling in MinIO + +## [0.2.0] + +### Added +- Integration with PostgreSQL for metadata storage +- MinIO support for artifact storage +- Helm test for verifying deployment + +## [0.1.0] + +### Added +- Initial chart release +- Basic MLflow tracking server deployment +- Support for configurable persistence + +{{- end -}} \ No newline at end of file diff --git a/applications/mlflow/charts/mlflow/README_CONFIG.md.gotmpl b/applications/mlflow/charts/mlflow/README_CONFIG.md.gotmpl new file mode 100644 index 00000000..0876aa4e --- /dev/null +++ b/applications/mlflow/charts/mlflow/README_CONFIG.md.gotmpl @@ -0,0 +1,139 @@ +{{- define "custom.config.header" -}} +# Configuration + +This document outlines the configuration options for the MLflow Helm chart. +{{- end -}} + +{{- define "custom.config.introduction" -}} +The following table lists the configurable parameters for the MLflow chart and their default values. +{{- end -}} + +{{- define "custom.config.advanced" -}} +## Advanced Configuration + +### PostgreSQL Configuration + +The chart uses PostgreSQL for storing MLflow metadata. You can configure the database connection using: + +```yaml +postgresql: + enabled: true + auth: + username: mlflow + password: mlflowpassword + database: mlflow + primary: + persistence: + size: 10Gi +``` + +### MinIO Configuration + +MinIO is used for artifact storage. Configure it with: + +```yaml +minio: + enabled: true + auth: + rootUser: minioadmin + rootPassword: minioadmin + persistence: + size: 20Gi + defaultBuckets: "mlflow" +``` +{{- end -}} + +{{- define "custom.config.externalServices" -}} +### Using External Storage + +To use external PostgreSQL: + +```yaml +postgresql: + enabled: false + +mlflow: + backendStore: + databaseUri: "postgresql://user:password@external-postgresql:5432/mlflow" +``` + +To use external S3-compatible storage: + +```yaml +minio: + enabled: false + +mlflow: + artifactRoot: + s3: + enabled: true + bucket: "mlflow" + endpoint: "s3.amazonaws.com" + accessKey: "your-access-key" + secretKey: "your-secret-key" + region: "us-east-1" +``` +{{- end -}} + +{{- define "custom.config.replicated" -}} +### Replicated SDK Integration + +Enable or disable the Replicated SDK integration: + +```yaml +replicated: + enabled: true +``` + +For development environments, you'll typically want to disable this: + +```yaml +replicated: + enabled: false +``` +{{- end -}} + +{{- define "custom.config.security" -}} +### Security Considerations + +By default, this chart doesn't include authentication. In production, consider: + +1. Using an ingress with authentication +2. Setting up TLS encryption +3. 
Configuring username/password protection + +Example ingress configuration with TLS: + +```yaml +ingress: + enabled: true + annotations: + kubernetes.io/ingress.class: nginx + cert-manager.io/cluster-issuer: letsencrypt-prod + hosts: + - host: mlflow.example.com + paths: + - path: / + pathType: Prefix + tls: + - secretName: mlflow-tls + hosts: + - mlflow.example.com +``` +{{- end -}} + +{{ template "custom.config.header" . }} + +{{ template "custom.config.introduction" . }} + +{{ template "custom.config.advanced" . }} + +{{ template "custom.config.externalServices" . }} + +{{ template "custom.config.replicated" . }} + +{{ template "custom.config.security" . }} + +{{ template "chart.valuesTable" . }} + +{{ template "chart.maintainersSection" . }} From 72dc308d411747919151f5eb7b16714bcc283e5c Mon Sep 17 00:00:00 2001 From: Diamon Wiggins Date: Mon, 14 Apr 2025 12:39:47 -0400 Subject: [PATCH 08/18] improve docs --- applications/mlflow/README.md | 20 + .../charts/mlflow/README_CONFIG.md.gotmpl | 668 +++++++++++++++++- 2 files changed, 654 insertions(+), 34 deletions(-) diff --git a/applications/mlflow/README.md b/applications/mlflow/README.md index d14e8b78..d0fc8167 100644 --- a/applications/mlflow/README.md +++ b/applications/mlflow/README.md @@ -90,9 +90,29 @@ The solution architecture consists of: - **MLflow Server**: Core MLflow tracking and registry services - **PostgreSQL**: Metadata storage for experiments, runs, and models + - Embedded PostgreSQL (default): Automatically deployed with the chart + - External PostgreSQL (optional): Connect to your existing database - **MinIO**: S3-compatible storage for artifacts and model files + - Embedded MinIO (default): Automatically deployed with the chart + - External S3-compatible storage (optional): Connect to your existing object storage - **Replicated Integration**: Management layer for installation and updates +### Storage Options + +This solution offers flexibility in how you store MLflow data: + +#### Metadata Storage + +- **Embedded PostgreSQL** (Default): Simplifies deployment with an automatically managed database +- **External PostgreSQL**: Connect to your existing PostgreSQL instance for better control, scaling, and integration with your infrastructure + +#### Artifact Storage + +- **Embedded MinIO** (Default): Provides S3-compatible storage within the deployment +- **External S3-compatible Storage**: Store artifacts in your own S3, GCS, or other S3-compatible storage service + +See the [Configuration Reference](./charts/mlflow/README_CONFIG.md) for detailed setup instructions. + ## Getting Started ### Prerequisites diff --git a/applications/mlflow/charts/mlflow/README_CONFIG.md.gotmpl b/applications/mlflow/charts/mlflow/README_CONFIG.md.gotmpl index 0876aa4e..85eb243a 100644 --- a/applications/mlflow/charts/mlflow/README_CONFIG.md.gotmpl +++ b/applications/mlflow/charts/mlflow/README_CONFIG.md.gotmpl @@ -13,18 +13,31 @@ The following table lists the configurable parameters for the MLflow chart and t ### PostgreSQL Configuration -The chart uses PostgreSQL for storing MLflow metadata. You can configure the database connection using: +The chart uses PostgreSQL for storing MLflow metadata. 
You can configure the embedded PostgreSQL database using: ```yaml -postgresql: - enabled: true - auth: - username: mlflow - password: mlflowpassword - database: mlflow - primary: - persistence: +postgres: + embedded: + # Enable embedded PostgreSQL + enabled: true + # Number of PostgreSQL instances for high availability + instances: 3 + # Database name + initdb: + database: mlflow + owner: mlflow + # Storage configuration + storage: size: 10Gi + storageClass: "" + # Resource configuration + resources: + requests: + memory: "1Gi" + cpu: "500m" + limits: + memory: "2Gi" + cpu: "1000m" ``` ### MinIO Configuration @@ -34,45 +47,99 @@ MinIO is used for artifact storage. Configure it with: ```yaml minio: enabled: true - auth: - rootUser: minioadmin - rootPassword: minioadmin - persistence: - size: 20Gi - defaultBuckets: "mlflow" + # Authentication credentials + secrets: + name: myminio-env-configuration + accessKey: minio + secretKey: minio1234 + # MinIO tenant configuration + tenant: + # Resource pool configuration + pools: + pool0: + servers: 3 + volumesPerServer: 4 + size: 10Gi + resources: + requests: + memory: "1Gi" + cpu: "250m" + # Create a bucket during provisioning + buckets: + - name: mlflow ``` {{- end -}} {{- define "custom.config.externalServices" -}} ### Using External Storage -To use external PostgreSQL: +#### External PostgreSQL -```yaml -postgresql: - enabled: false +To use an external PostgreSQL database instead of the embedded one: +```yaml +# Disable the embedded PostgreSQL +postgres: + embedded: + enabled: false + external: + enabled: true + # External PostgreSQL connection details + host: "external-postgresql-host" + port: 5432 + database: "mlflow" + +# Configure MLflow backend store mlflow: backendStore: - databaseUri: "postgresql://user:password@external-postgresql:5432/mlflow" + # Option 1: Using a full database URI + databaseUri: "postgresql://user:password@external-postgresql-host:5432/mlflow" + # OR Option 2: The connection details will be auto-configured from postgres.external values ``` -To use external S3-compatible storage: +Make sure your external database is accessible from your Kubernetes cluster and has the necessary permissions for MLflow to create its schema. 
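Before pointing MLflow at an external database, it can help to confirm that the cluster can actually reach it and that the credentials work. A minimal sketch using a throwaway client pod; the host, user, and password below are placeholders to substitute with your own values:

```bash
# Launch a temporary pod with the psql client and test the connection end to end
kubectl run pg-connectivity-check --rm -it --restart=Never \
  --image=postgres:15 -- \
  psql "postgresql://mlflow:<password>@external-postgresql-host:5432/mlflow" -c '\conninfo'

# A successful run prints the connection details and the pod is removed afterwards.
# An error here usually points at a network policy, firewall, or credential problem
# rather than anything in the MLflow chart itself.
```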
+ +#### External S3-compatible Storage + +To use external S3-compatible storage for MLflow artifacts: ```yaml +# Disable the embedded MinIO minio: enabled: false +# Configure MLflow artifact store to use external S3 mlflow: - artifactRoot: + # Set the artifact destination + trackingServer: + artifactsDestination: "s3://my-external-bucket/mlflow" + + # Configure S3 artifact store + artifactStore: s3: enabled: true - bucket: "mlflow" - endpoint: "s3.amazonaws.com" - accessKey: "your-access-key" - secretKey: "your-secret-key" - region: "us-east-1" + # Option 1: Using an existing secret with AWS credentials + existingSecret: "my-aws-secret" + # OR Option 2: Provide credentials directly (not recommended for production) + # accessKeyId: "your-access-key" + # secretAccessKey: "your-secret-key" + + # Configure external S3 details + external: + enabled: true + protocol: https + host: "s3.amazonaws.com" # For AWS S3 + # host: "storage.googleapis.com" # For Google Cloud Storage + port: 443 + ignoreTls: false ``` + +This configuration works with any S3-compatible storage, including: +- Amazon S3 +- Google Cloud Storage (with interoperability enabled) +- MinIO (self-hosted) +- DigitalOcean Spaces +- And other compatible services {{- end -}} {{- define "custom.config.replicated" -}} @@ -94,17 +161,51 @@ replicated: {{- end -}} {{- define "custom.config.security" -}} -### Security Considerations +### Security Configurations + +Secure your MLflow deployment with the following configuration options. + +#### Authentication and Authorization -By default, this chart doesn't include authentication. In production, consider: +MLflow supports several authentication methods: + +```yaml +# Basic Auth +mlflow: + auth: + enabled: true + type: "basic" + users: + - username: admin + password: "" # Will generate a random password if empty + isAdmin: true + - username: readonly + password: "example-password" # Not recommended, use secrets instead + isAdmin: false + +# OIDC/OAuth2 Integration +mlflow: + auth: + enabled: true + type: "oauth" + oauth: + clientId: "mlflow-client" + clientSecret: "" # Use secretRef instead for production + secretRef: + name: "mlflow-oauth-secret" + key: "client-secret" + provider: "keycloak" # or "okta", "auth0", etc. + issuerUrl: "https://keycloak.example.com/auth/realms/mlflow" + redirectUri: "https://mlflow.example.com/oauth/callback" + scopes: "openid profile email" +``` -1. Using an ingress with authentication -2. Setting up TLS encryption -3. 
Configuring username/password protection +#### Network Security -Example ingress configuration with TLS: +Secure communication with TLS and network policies: ```yaml +# TLS Configuration ingress: enabled: true annotations: @@ -114,14 +215,505 @@ ingress: - host: mlflow.example.com paths: - path: / - pathType: Prefix tls: - secretName: mlflow-tls hosts: - mlflow.example.com + +# Network Policies +networkPolicies: + enabled: true + # Only allow traffic from specific namespaces + ingressFrom: + - namespaceSelector: + matchLabels: + name: data-science + - namespaceSelector: + matchLabels: + name: ml-pipeline +``` + +#### Secrets Management + +Use Kubernetes secrets for sensitive information: + +```yaml +secretsManager: + enabled: true + # Integrate with external secrets providers + externalSecrets: + enabled: true + backend: "aws-secretsmanager" # or "vault", "gcp-secretmanager" + secretMapping: + - secretName: "mlflow-database-credentials" + externalName: "prod/mlflow/db-credentials" + - secretName: "mlflow-s3-credentials" + externalName: "prod/mlflow/s3-credentials" +``` + +#### Pod Security Context + +Set security contexts for pods and containers: + +```yaml +securityContext: + # Pod-level security context + runAsUser: 1000 + runAsGroup: 1000 + fsGroup: 1000 + +# Container-level security context +containerSecurityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL +``` +{{- end -}} + +{{- define "custom.config.resources" -}} +### Resource Configurations + +Configure appropriate resources for MLflow server and its dependencies to ensure optimal performance. + +#### MLflow Server Resources + +```yaml +mlflow: + # Configure resources for the MLflow server + resources: + limits: + cpu: 1000m + memory: 1Gi + requests: + cpu: 500m + memory: 512Mi +``` + +#### PostgreSQL Resources (when using embedded PostgreSQL) + +```yaml +postgresql: + # Configure resources for the PostgreSQL server + primary: + resources: + limits: + cpu: 1000m + memory: 1Gi + requests: + cpu: 250m + memory: 256Mi +``` + +#### MinIO Resources (when using embedded MinIO) + +```yaml +minio: + # Configure resources for the MinIO server + resources: + limits: + cpu: 1000m + memory: 2Gi + requests: + cpu: 250m + memory: 512Mi +``` + +#### Advanced Configurations + +For high-traffic environments, increase resource allocations and consider enabling autoscaling: + +```yaml +mlflow: + resources: + limits: + cpu: 2000m + memory: 4Gi + requests: + cpu: 1000m + memory: 2Gi + + # Configure horizontal pod autoscaling + autoscaling: + enabled: true + minReplicas: 2 + maxReplicas: 10 + targetCPUUtilizationPercentage: 80 + targetMemoryUtilizationPercentage: 80 +``` +{{- end -}} + +{{- define "custom.config.persistence" -}} +### Persistence Configurations + +Configure persistent storage for MLflow artifacts and databases to ensure data durability. 
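After installing with persistence enabled, it is worth confirming that the claims were actually provisioned before relying on the data surviving a restart. A quick check, assuming the release is installed in the `mlflow` namespace:

```bash
# List the persistent volume claims created for the database and artifact store
kubectl get pvc -n mlflow

# Every claim should report STATUS "Bound". A claim stuck in "Pending" usually
# means the requested storageClass does not exist or cannot provision volumes.
```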
+ +#### Artifact Storage + +Configure MinIO or other S3-compatible storage for MLflow artifacts: + +```yaml +# Using embedded MinIO (default) +minio: + enabled: true + persistence: + enabled: true + size: 10Gi + storageClass: "standard" + # Improve reliability with distributed setup + mode: distributed + replicas: 4 + +# Or configure external S3-compatible storage +externalS3: + enabled: true + endpoint: "s3.amazonaws.com" + bucket: "mlflow-artifacts" + region: "us-west-2" + # Use Kubernetes secrets for credentials + secretName: "s3-credentials" +``` + +#### Database Persistence + +Configure PostgreSQL persistence for MLflow metadata: + +```yaml +# Using embedded PostgreSQL (default) +postgresql: + enabled: true + persistence: + enabled: true + size: 8Gi + storageClass: "standard" + # Optional high-availability settings + primary: + persistence: + enabled: true + size: 8Gi + + # For production, consider configuring backups + backup: + enabled: true + schedule: "0 0 * * *" # Daily backup at midnight + storage: + storageClass: "standard" + size: 10Gi + +# Or configure external PostgreSQL +externalPostgresql: + enabled: true + host: "postgresql.database.svc.cluster.local" + port: 5432 + database: "mlflow" + # Use Kubernetes secrets for credentials + secretName: "postgresql-credentials" +``` + +#### Backing Up and Restoring Data + +For critical deployments, configure regular backups: + +```yaml +backup: + enabled: true + schedule: "0 0 * * *" # Daily backup at midnight + retention: 7 # Keep 7 days of backups + destination: + s3: + bucket: "mlflow-backups" + region: "us-west-2" + secretName: "backup-credentials" ``` {{- end -}} +{{- define "custom.config.monitoring" -}} +### Monitoring and Observability + +Configure monitoring for your MLflow deployment to ensure optimal performance and reliability. 
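Once metrics collection is turned on, a quick way to confirm the endpoint is actually serving data before building dashboards is to query it directly. A sketch, assuming the service exposes a named `metrics` port and the release lives in the `mlflow` namespace:

```bash
# Temporarily forward the metrics port and sample the exposed metrics
kubectl port-forward -n mlflow svc/mlflow 9090:metrics >/dev/null 2>&1 &
PF_PID=$!
sleep 2
curl -s http://localhost:9090/metrics | head -n 20

# Stop the temporary port-forward
kill "$PF_PID"
```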
+ +#### Prometheus Metrics + +Enable Prometheus metrics collection for MLflow components: + +```yaml +metrics: + enabled: true + serviceMonitor: + enabled: true + # If using Prometheus Operator + additionalLabels: + release: prometheus + # Scrape interval + interval: 30s + # Metrics path + path: /metrics + # Metrics port + port: metrics +``` + +#### Logging Configuration + +Configure logging levels and output formats: + +```yaml +logging: + # Global log level + level: INFO # DEBUG, INFO, WARNING, ERROR + # Log format options + format: json # or text + # Retention configuration + retention: + days: 7 + maxSize: 500Mi +``` + +#### Grafana Dashboards + +Automatically provision Grafana dashboards for MLflow metrics: + +```yaml +dashboards: + enabled: true + # Label that Grafana uses to discover dashboards + label: grafana_dashboard + # Annotations + annotations: + grafana_folder: MLflow + # Dashboard configurations + mlflowDashboard: + enabled: true + databaseDashboard: + enabled: true + artifactStoreDashboard: + enabled: true +``` + +#### Alerts and Notifications + +Configure alerts for critical metrics: + +```yaml +alerts: + enabled: true + # Configure alert rules + rules: + highCpuUsage: + expr: 'avg(rate(container_cpu_usage_seconds_total{container="mlflow"}[5m])) > 0.8' + for: 10m + labels: + severity: warning + annotations: + summary: "High CPU usage for MLflow" + description: "MLflow has high CPU usage (> 80%) for the last 10 minutes" + + highMemoryUsage: + expr: 'avg(container_memory_usage_bytes{container="mlflow"}) / avg(container_spec_memory_limit_bytes{container="mlflow"}) > 0.9' + for: 10m + labels: + severity: warning + annotations: + summary: "High memory usage for MLflow" + description: "MLflow is using over 90% of its memory allocation" + + # Configure alert receivers + receivers: + slack: + enabled: true + channel: "#mlflow-alerts" + webhookUrl: "" # Use secretRef in production + secretRef: + name: slack-webhook + key: url + email: + enabled: false + to: "mlops-team@example.com" +``` + +#### Tracing + +Enable distributed tracing for MLflow requests: + +```yaml +tracing: + enabled: true + # Supported providers: jaeger, zipkin, datadog + provider: jaeger + jaeger: + endpoint: "http://jaeger-collector:14268/api/traces" + samplingRate: 0.1 # Sample 10% of requests + # Sample rate configuration + sampler: + type: const + param: 1 # 1 = sample all, lower values sample less +``` +{{- end -}} + +{{- define "custom.config.ha" -}} +### High Availability and Scaling + +Configure MLflow for high availability and optimal performance at scale. 
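Once these settings are applied, a quick check that the autoscaler and disruption budget exist and are receiving metrics can save debugging time later. A sketch, assuming the release is installed in the `mlflow` namespace:

```bash
# Confirm the HorizontalPodAutoscaler and PodDisruptionBudget were created
kubectl get hpa,pdb -n mlflow

# Inspect the autoscaler's view of current utilization; "unknown" targets usually
# mean metrics-server is not installed, so no scaling will ever happen
kubectl describe hpa -n mlflow
```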
+ +#### Horizontal Pod Autoscaling + +Enable automatic scaling based on resource utilization: + +```yaml +autoscaling: + enabled: true + minReplicas: 2 + maxReplicas: 10 + targetCPUUtilizationPercentage: 70 + targetMemoryUtilizationPercentage: 80 + behavior: + scaleDown: + stabilizationWindowSeconds: 300 + policies: + - type: Percent + value: 25 + periodSeconds: 60 + scaleUp: + stabilizationWindowSeconds: 0 + policies: + - type: Percent + value: 100 + periodSeconds: 60 +``` + +#### Multi-Zone Deployment + +Configure pod anti-affinity for high availability across zones: + +```yaml +affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: In + values: + - mlflow + topologyKey: "topology.kubernetes.io/zone" + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/role + operator: In + values: + - mlflow-nodes +``` + +#### Replicas and Load Balancing + +Configure the number of replicas and load balancing strategies: + +```yaml +replicaCount: 3 + +service: + type: ClusterIP + port: 80 + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: nlb + service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled: "true" + sessionAffinity: ClientIP + sessionAffinityConfig: + clientIP: + timeoutSeconds: 10800 # 3 hours +``` + +#### Pod Disruption Budget + +Define a Pod Disruption Budget to ensure availability during voluntary disruptions: + +```yaml +podDisruptionBudget: + enabled: true + minAvailable: 1 + # Or use maxUnavailable instead + # maxUnavailable: 1 +``` + +#### Database High Availability + +Configure database for high availability: + +```yaml +postgresql: + enabled: true + architecture: replication + auth: + username: mlflow + database: mlflow + primary: + replicaCount: 1 + persistence: + enabled: true + size: 10Gi + readReplicas: + replicaCount: 2 + persistence: + enabled: true + size: 10Gi + metrics: + enabled: true + volumePermissions: + enabled: true +``` + +#### Connection Pooling + +Configure connection pooling for database access: + +```yaml +connectionPooling: + enabled: true + maxConnections: 100 + minConnections: 5 + maxConnectionAge: 600 # 10 minutes + connectionTimeout: 30 # 30 seconds + poolSize: 20 +``` + +#### Resource Allocation + +Configure resource requests and limits appropriate for high-traffic environments: + +```yaml +resources: + limits: + cpu: 2000m + memory: 4Gi + requests: + cpu: 500m + memory: 1Gi + +# Job specific resources +jobs: + resources: + limits: + cpu: 1000m + memory: 2Gi + requests: + cpu: 250m + memory: 512Mi +``` +{{- end -}} + +{{- define "custom.config.all" -}} +{{- template "custom.config.resources" . -}} +{{- template "custom.config.persistence" . -}} +{{- template "custom.config.security" . -}} +{{- template "custom.config.monitoring" . -}} +{{- template "custom.config.ha" . -}} +{{- template "chart.valuesTable" . -}} +{{- end -}} + {{ template "custom.config.header" . }} {{ template "custom.config.introduction" . }} @@ -134,6 +726,14 @@ ingress: {{ template "custom.config.security" . }} +{{ template "custom.config.resources" . }} + +{{ template "custom.config.persistence" . }} + +{{ template "custom.config.monitoring" . }} + +{{ template "custom.config.ha" . }} + {{ template "chart.valuesTable" . }} {{ template "chart.maintainersSection" . 
}} From e6cc72efe70ac917f47ab5828980d410b060592c Mon Sep 17 00:00:00 2001 From: Diamon Wiggins Date: Wed, 16 Apr 2025 12:24:36 -0400 Subject: [PATCH 09/18] readme updates --- .github/workflows/mlflow-ci.yml | 71 ++++++- applications/mlflow/Taskfile.yml | 199 ++++++++++++++++++ .../charts/mlflow/README_CONFIG.md.gotmpl | 33 +++ 3 files changed, 302 insertions(+), 1 deletion(-) diff --git a/.github/workflows/mlflow-ci.yml b/.github/workflows/mlflow-ci.yml index 7f94c44a..782715c0 100644 --- a/.github/workflows/mlflow-ci.yml +++ b/.github/workflows/mlflow-ci.yml @@ -66,9 +66,78 @@ jobs: # Ensure Chart.yaml and HelmChart versions are in sync task check:versions + helm-docs: + runs-on: ubuntu-22.04 + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Install Task + uses: arduino/setup-task@v1 + with: + version: 3.x + repo-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Set up Helm + uses: azure/setup-helm@v4.3.0 + with: + version: v3.13.3 + + - name: Install helm-docs + run: | + HELM_DOCS_VERSION=v1.12.0 + wget https://github.com/norwoodj/helm-docs/releases/download/${HELM_DOCS_VERSION}/helm-docs_${HELM_DOCS_VERSION#v}_Linux_x86_64.tar.gz -O - | tar -xz + sudo mv helm-docs /usr/local/bin/helm-docs + helm-docs --version + + - name: Check Helm Documentation + working-directory: applications/mlflow + run: | + # Use Taskfile to check if helm docs are up to date + task add:repos:helm + task update:deps:helm + task docs:helm:check + + - name: Generate Helm Documentation + if: github.event_name == 'push' && github.ref == 'refs/heads/main' + working-directory: applications/mlflow + run: | + # Only generate documentation on main branch pushes + task docs:helm:generate + + - name: Generate KOTS Manifest Guide + if: github.event_name == 'push' && github.ref == 'refs/heads/main' + working-directory: applications/mlflow + run: | + # Generate KOTS manifest guide + task docs:kots:summary + + - name: Create PR if docs changed + if: github.event_name == 'push' && github.ref == 'refs/heads/main' + uses: peter-evans/create-pull-request@v5 + with: + token: ${{ secrets.GITHUB_TOKEN }} + commit-message: "docs: update documentation" + title: "docs: update documentation" + body: | + This PR updates documentation: + + - Updated Helm chart documentation based on the current templates + - Generated KOTS manifest guide for platform engineers + + Automatically generated by the MLflow CI workflow. + branch: update-docs + base: main + labels: documentation + paths: | + applications/mlflow/charts/*/README.md + applications/mlflow/docs/KOTS_MANIFEST_GUIDE.md + create-release: runs-on: ubuntu-22.04 - needs: [lint-and-template] + needs: [lint-and-template, helm-docs] outputs: customer-id: ${{ steps.create-customer.outputs.customer-id }} channel-slug: ${{ steps.create-release.outputs.channel-slug }} diff --git a/applications/mlflow/Taskfile.yml b/applications/mlflow/Taskfile.yml index 2a6c4d21..beca5c49 100644 --- a/applications/mlflow/Taskfile.yml +++ b/applications/mlflow/Taskfile.yml @@ -795,6 +795,205 @@ tasks: cmds: - echo "All tests completed successfully" + # Documentation generation tasks + docs:helm:generate: + desc: Generate Helm chart documentation from templates + deps: [add:repos:helm, update:deps:helm] + cmds: + - echo "Generating Helm chart documentation..." + - | + # Make sure helm-docs is installed + if ! command -v helm-docs &> /dev/null; then + echo "❌ helm-docs is not installed. 
Please install it from https://github.com/norwoodj/helm-docs" + exit 1 + fi + + # Run helm-docs for each chart + for chart in {{.CHARTS}}; do + echo "Generating documentation for $chart chart..." + cd {{.CHART_DIR}}/$chart && helm-docs -t README.md.gotmpl -t README_CHANGELOG.md.gotmpl -t README_CONFIG.md.gotmpl + done + + echo "✅ Helm chart documentation generated successfully." + + docs:helm:check: + desc: Check if Helm chart documentation is up to date + deps: [add:repos:helm, update:deps:helm] + cmds: + - echo "Checking if Helm chart documentation is up to date..." + - | + # Make sure helm-docs is installed + if ! command -v helm-docs &> /dev/null; then + echo "❌ helm-docs is not installed. Please install it from https://github.com/norwoodj/helm-docs" + exit 1 + fi + + docs_outdated=false + + # For each chart, generate docs to a temp dir and compare with current docs + for chart in {{.CHARTS}}; do + echo "Checking documentation for $chart chart..." + + # Create temp directory + tmp_dir=$(mktemp -d) + trap 'rm -rf "$tmp_dir"' EXIT + + # Copy current README.md to temp dir + readme_path="{{.CHART_DIR}}/$chart/README.md" + tmp_readme="$tmp_dir/README.md" + + if [ -f "$readme_path" ]; then + cp "$readme_path" "$tmp_readme" + else + echo "⚠️ README.md not found for $chart chart. This check will only be useful after docs are generated." + touch "$tmp_readme" # Create empty file for comparison + fi + + # Generate fresh docs + cd {{.CHART_DIR}}/$chart && helm-docs -t README.md.gotmpl -t README_CHANGELOG.md.gotmpl -t README_CONFIG.md.gotmpl -o "$tmp_dir" + + # Compare with current docs + if [ -f "$readme_path" ] && ! diff -q "$readme_path" "$tmp_readme" > /dev/null; then + echo "❌ Documentation for $chart chart is outdated. Run 'task docs:helm:generate' to update." + docs_outdated=true + else + echo "✅ Documentation for $chart chart is up to date." + fi + done + + # Exit with error if any docs are outdated + if [ "$docs_outdated" = true ]; then + echo "❌ Some chart documentation files are outdated. Run 'task docs:helm:generate' to update them." + exit 1 + else + echo "✅ All chart documentation is up to date." + fi + + docs:kots:summary: + desc: Generate a summary of KOTS manifest files for platform engineers + cmds: + - echo "Generating KOTS manifest summary in docs/KOTS_MANIFEST_GUIDE.md..." + - | + # Create docs directory if it doesn't exist + mkdir -p docs + + # Generate the summary document + cat > docs/KOTS_MANIFEST_GUIDE.md << 'EOF' + # MLflow KOTS Manifest Guide + + This document provides a technical overview of the Replicated KOTS manifests used to package and deliver the MLflow application. + + ## Overview + + The manifests in the `applications/mlflow/kots` directory define how MLflow is packaged, configured, and deployed using the Replicated platform. + + ## Key Files and Their Purpose + + ### kots-app.yaml + + Defines the core application properties: + - Application metadata (name, icon, description) + - Status informers for monitoring deployment health + - Port definitions for services + - Release notes handling + + ### kots-config.yaml + + Contains all user-configurable settings presented during installation: + - Database configuration (embedded PostgreSQL or external) + - S3 storage settings (embedded MinIO or external S3) + - Networking and ingress configuration + - Resource allocation settings + + Each configuration option includes: + - Type (string, boolean, password, etc.) 
+ - Default values + - Validation rules + - Help text for users + - Dependencies/when conditions + + ### mlflow-chart.yaml + + A HelmChart custom resource that: + - References the MLflow Helm chart + - Maps configuration values from user inputs to Helm values + - Defines conditional logic for different deployment scenarios + - Uses templating functions to insert configuration values + + ### infra-chart.yaml + + Similar to mlflow-chart.yaml but for infrastructure components: + - Configures supporting services (databases, object storage) + - Typically deployed before the main application + + ### kots-preflight.yaml + + Defines preflight checks to validate the environment before installation: + - Kubernetes version compatibility + - Resource availability (CPU, memory) + - Namespace access permissions + - Storage class availability + + ### k8s-app.yaml + + Kubernetes Application custom resource for discovery and management. + + ### ec.yaml + + EntitlementSpec that controls: + - License entitlements and limits + - Feature flags based on license tier + - Usage restrictions + + ## Best Practices for Modifications + + When making changes to these files: + + 1. **Version Consistency**: Update both Helm chart versions and kots-chart references + 2. **Testing**: Test with both new installations and upgrades + 3. **Config Options**: When adding new config options: + - Provide meaningful default values + - Include clear help text + - Consider when dependencies for conditional display + 4. **Templates**: Use consistent templating patterns + 5. **Preflight Checks**: Update preflight checks when requirements change + + ## Common Tasks + + ### Adding a New Configuration Option + + 1. Add the option to `kots-config.yaml` with appropriate metadata + 2. Reference the value in `mlflow-chart.yaml` using template functions + 3. Update preflight checks if the option affects requirements + + ### Updating Helm Chart Versions + + 1. Update the chartVersion in both infra-chart.yaml and mlflow-chart.yaml + 2. Run `task update:versions:chart` to ensure consistency + 3. Test both installation and upgrade scenarios + + ### Adding Support for a New Feature + + 1. First, implement the feature in the Helm chart + 2. Add any new configuration options to kots-config.yaml + 3. Connect the configuration to the Helm values in the chart files + 4. Optionally add preflight checks for any new requirements + 5. Test with both embedded and external dependencies + + ## Troubleshooting + + Common issues when working with these files: + + - **Template Rendering Errors**: Check template syntax in .yaml files + - **Preflight Check Failures**: Ensure requirements are correctly specified + - **Configuration Mismatches**: Verify config option names match between files + - **Upgrade Issues**: Test upgrades from earlier versions + + For more detailed information, refer to the [Replicated KOTS documentation](https://docs.replicated.com/vendor/packaging-kots-apps). 
+ EOF + + echo "✅ KOTS manifest summary generated in docs/KOTS_MANIFEST_GUIDE.md" + # Version extraction extract:version:chart: desc: Extract and print the MLflow chart version diff --git a/applications/mlflow/charts/mlflow/README_CONFIG.md.gotmpl b/applications/mlflow/charts/mlflow/README_CONFIG.md.gotmpl index 85eb243a..e7a45dc3 100644 --- a/applications/mlflow/charts/mlflow/README_CONFIG.md.gotmpl +++ b/applications/mlflow/charts/mlflow/README_CONFIG.md.gotmpl @@ -705,12 +705,43 @@ jobs: ``` {{- end -}} +{{- define "custom.understanding.platform" -}} +### Understanding Platform Integration Files + +This section describes the KOTS manifest files used for platform integration in the `applications/mlflow/kots` directory. These files enable MLflow to be deployed through the Replicated platform. + +#### KOTS Manifest Files + +| File | Description | +| ---- | ----------- | +| `kots-app.yaml` | Defines the application metadata for KOTS, including title, icon, status informers, and ports. | +| `kots-config.yaml` | Contains all configurable options presented to the user during installation, organized in groups like database settings, S3 storage, and networking configuration. | +| `mlflow-chart.yaml` | A HelmChart custom resource that integrates the MLflow Helm chart with KOTS, connecting user configuration options to Helm values. | +| `infra-chart.yaml` | A HelmChart custom resource for infrastructure components that MLflow depends on. | +| `kots-preflight.yaml` | Defines preflight checks that run before installation to validate the environment meets requirements. | +| `kots-support-bundle.yaml` | Configures support bundle collection for troubleshooting. | +| `k8s-app.yaml` | Kubernetes Application custom resource definition. | +| `ec.yaml` | EntitlementSpec that defines license entitlements and limits. | + +#### Integration Pattern + +These files work together to create an integrated experience: + +1. The user configures settings through the options defined in `kots-config.yaml` +2. The values are injected into the Helm charts via template functions in `mlflow-chart.yaml` and `infra-chart.yaml` +3. Preflight checks in `kots-preflight.yaml` ensure the environment is properly set up +4. Deployment status is tracked via the informers defined in `kots-app.yaml` + +When making changes to the MLflow Helm chart, corresponding updates may be needed in the KOTS manifests to ensure proper integration. +{{- end -}} + {{- define "custom.config.all" -}} {{- template "custom.config.resources" . -}} {{- template "custom.config.persistence" . -}} {{- template "custom.config.security" . -}} {{- template "custom.config.monitoring" . -}} {{- template "custom.config.ha" . -}} +{{- template "custom.understanding.platform" . -}} {{- template "chart.valuesTable" . -}} {{- end -}} @@ -734,6 +765,8 @@ jobs: {{ template "custom.config.ha" . }} +{{ template "custom.understanding.platform" . }} + {{ template "chart.valuesTable" . }} {{ template "chart.maintainersSection" . 
}} From e142f57b3c90003990d1927cb0a18dea7f45e064 Mon Sep 17 00:00:00 2001 From: Diamon Wiggins Date: Wed, 16 Apr 2025 12:36:31 -0400 Subject: [PATCH 10/18] break down port forward task --- applications/mlflow/Taskfile.yml | 327 +++++++++---------- applications/mlflow/charts/mlflow/Chart.lock | 6 +- 2 files changed, 161 insertions(+), 172 deletions(-) diff --git a/applications/mlflow/Taskfile.yml b/applications/mlflow/Taskfile.yml index beca5c49..ad1277b0 100644 --- a/applications/mlflow/Taskfile.yml +++ b/applications/mlflow/Taskfile.yml @@ -436,6 +436,9 @@ tasks: echo "No custom values file provided. Using default values." fi + # Clean up any existing port forwarding first + task cleanup:port:forward || true + # Install infra chart from Replicated registry echo "Installing infra chart from Replicated registry..." echo "Chart path: $OCI_URL/infra" @@ -485,6 +488,9 @@ tasks: exit 1 fi + # Clean up any existing port forwarding first + task cleanup:port:forward || true + # Create directory for license file if it doesn't exist mkdir -p /tmp/replicated LICENSE_FILE="/tmp/replicated/license.yaml" @@ -547,12 +553,12 @@ tasks: echo "✅ KOTS installation completed. Setting up port forwarding for testing..." - task: forward:port - # Port forwarding task - forward:port: - desc: Setup port forwarding to MLflow service for testing + # Port forwarding task - now broken into smaller sub-tasks + find:mlflow:service: + desc: Find the MLflow service in the namespace internal: true cmds: - - echo "Setting up port forwarding to MLflow service..." + - echo "Looking for MLflow service in namespace {{.NAMESPACE}}..." - | # Wait for the MLflow service to be created echo "Waiting for MLflow service to be created..." @@ -588,7 +594,14 @@ tasks: # Verify the services are present echo "Verifying MLflow service exists..." kubectl get svc -n {{.NAMESPACE}} - + echo "✅ MLflow service verification completed." + + wait:mlflow:pods: + desc: Wait for MLflow pods to be ready + internal: true + cmds: + - echo "Checking MLflow pod status..." + - | # Check pod status and wait for them to be running echo "Checking pod status..." kubectl get pods -n {{.NAMESPACE}} @@ -598,7 +611,14 @@ tasks: echo "WARNING: Timed out waiting for pods to be ready, will try port-forwarding anyway" kubectl describe pods -n {{.NAMESPACE}} } - + echo "✅ Pod readiness check completed." + + setup:port:forward: + desc: Set up port forwarding to MLflow service + internal: true + cmds: + - echo "Setting up port forwarding to MLflow service..." + - | SERVICE_NAME=$(kubectl get svc -n {{.NAMESPACE}} -l app.kubernetes.io/name=mlflow -o name | head -n 1) if [ -z "$SERVICE_NAME" ]; then echo "ERROR: Could not find MLflow service with label app.kubernetes.io/name=mlflow" @@ -608,7 +628,6 @@ tasks: echo "Setting up port forwarding to $SERVICE_NAME..." # Set up port forwarding in the background with logs echo "Setting up port forwarding using nohup..." - # Use nohup to ensure the process runs in the background even if the parent process exits PORT_FORWARD_LOG="/tmp/port-forward-mlflow-$$.log" nohup kubectl port-forward -n {{.NAMESPACE}} $SERVICE_NAME {{.PORT}}:5000 > $PORT_FORWARD_LOG 2>&1 & PORT_FORWARD_PID=$! @@ -646,6 +665,25 @@ tasks: echo "Port forwarding set up with PID: $PORT_FORWARD_PID" fi + # Save the PID to a file for other tasks to use + echo $PORT_FORWARD_PID > /tmp/mlflow-port-forward-main.pid + echo "✅ Port forwarding initialized." 
+ + check:port:forward: + desc: Check if port forwarding is working and restart if needed + internal: true + cmds: + - echo "Checking port forwarding status..." + - | + # Read PID from file if it exists + PORT_FORWARD_LOG="/tmp/port-forward-mlflow-$$.log" + if [ -f "/tmp/mlflow-port-forward-main.pid" ]; then + PORT_FORWARD_PID=$(cat /tmp/mlflow-port-forward-main.pid) + else + echo "WARNING: No PID file found for port forwarding process" + PORT_FORWARD_PID="" + fi + # Give port-forward more time to establish echo "Waiting for port-forward to establish..." sleep 5 @@ -657,34 +695,43 @@ tasks: echo "WARNING: Port forwarding process with PID $PORT_FORWARD_PID is not running" echo "Port forwarding log:" cat $PORT_FORWARD_LOG || echo "No log file found" - echo "Will try to connect anyway..." - fi - fi - - # Check if port-forward is still running - if [ -n "$PORT_FORWARD_PID" ] && ! ps -p $PORT_FORWARD_PID > /dev/null 2>&1; then - echo "ERROR: Port forwarding process died during connection attempts." - echo "Port forwarding log:" - cat $PORT_FORWARD_LOG || echo "No log file found" - - # Restart port forwarding as a fallback - echo "Attempting to restart port forwarding..." - nohup kubectl port-forward -n {{.NAMESPACE}} $SERVICE_NAME {{.PORT}}:5000 > $PORT_FORWARD_LOG 2>&1 & - PORT_FORWARD_PID=$! - sleep 3 - - if [ -z "$PORT_FORWARD_PID" ] || [ "$PORT_FORWARD_PID" = "0" ]; then - echo "WARNING: Failed to capture restarted port-forward process PID" - echo "Will continue without checking process status" + echo "Will try to restart port forwarding..." + + # Get service name again + SERVICE_NAME=$(kubectl get svc -n {{.NAMESPACE}} -l app.kubernetes.io/name=mlflow -o name | head -n 1) + if [ -z "$SERVICE_NAME" ]; then + echo "ERROR: Could not find MLflow service to restart port forwarding" + echo "Will try to connect anyway..." + else + # Restart port forwarding + echo "Attempting to restart port forwarding..." + nohup kubectl port-forward -n {{.NAMESPACE}} $SERVICE_NAME {{.PORT}}:5000 > $PORT_FORWARD_LOG 2>&1 & + PORT_FORWARD_PID=$! + echo $PORT_FORWARD_PID > /tmp/mlflow-port-forward-main.pid + sleep 3 + + if [ -z "$PORT_FORWARD_PID" ] || [ "$PORT_FORWARD_PID" = "0" ]; then + echo "WARNING: Failed to capture restarted port-forward process PID" + echo "Will continue without checking process status" + else + echo "Restarted port forwarding with PID: $PORT_FORWARD_PID" + fi + + sleep 5 # Give the new port-forward time to establish + fi else - echo "Restarted port forwarding with PID: $PORT_FORWARD_PID" + echo "✅ Port forwarding process is running with PID: $PORT_FORWARD_PID" fi - - sleep 5 # Give the new port-forward time to establish fi - + echo "✅ Port forwarding check completed." + + test:connectivity: + desc: Test connectivity to MLflow service + internal: true + cmds: + - echo "Testing connectivity to MLflow on localhost:{{.PORT}}..." + - | # Basic connectivity check - echo "Checking connectivity to MLflow on localhost:{{.PORT}}..." MAX_CONN_RETRIES=5 CONN_RETRY_COUNT=0 CONN_SUCCESS=false @@ -695,18 +742,21 @@ tasks: # Try curling the MLflow endpoint if curl -s -o /dev/null -w "%{http_code}" http://localhost:{{.PORT}}/ > /dev/null 2>&1; then - echo "Successfully connected to MLflow service!" + echo "✅ Successfully connected to MLflow service!" CONN_SUCCESS=true break else echo "Connection attempt $CONN_RETRY_COUNT failed, retrying in 5 seconds..." # Check if port-forward is still running - if [ -n "$PORT_FORWARD_PID" ] && ! 
ps -p $PORT_FORWARD_PID > /dev/null 2>&1; then - echo "WARNING: Port forwarding process with PID $PORT_FORWARD_PID is not running" - echo "Port forwarding log:" - cat $PORT_FORWARD_LOG || echo "No log file found" - echo "Will try to connect anyway..." + if [ -f "/tmp/mlflow-port-forward-main.pid" ]; then + PORT_FORWARD_PID=$(cat /tmp/mlflow-port-forward-main.pid) + if [ -n "$PORT_FORWARD_PID" ] && ! ps -p $PORT_FORWARD_PID > /dev/null 2>&1; then + echo "WARNING: Port forwarding process with PID $PORT_FORWARD_PID is not running" + echo "Port forwarding log:" + cat /tmp/port-forward-mlflow-$$.log || echo "No log file found" + echo "Will try to connect anyway..." + fi fi sleep 5 @@ -716,14 +766,77 @@ tasks: if [ "$CONN_SUCCESS" != "true" ]; then echo "WARNING: Could not connect to MLflow service after $MAX_CONN_RETRIES attempts." echo "This may indicate issues with the service or port forwarding." - echo "Port forwarding log:" - cat $PORT_FORWARD_LOG + + if [ -f "/tmp/port-forward-mlflow-$$.log" ]; then + echo "Port forwarding log:" + cat /tmp/port-forward-mlflow-$$.log + fi + echo "Pod logs:" kubectl logs -n {{.NAMESPACE}} -l app.kubernetes.io/name=mlflow --tail=20 || true echo "Continuing anyway, but tests may fail." fi + echo "✅ Connectivity test completed." + + forward:port: + desc: Setup port forwarding to MLflow service for testing + internal: true + deps: [find:mlflow:service, wait:mlflow:pods] + cmds: + - echo "Setting up port forwarding to MLflow service..." + - task: setup:port:forward + - task: check:port:forward + - task: test:connectivity + - echo "✅ Port forwarding setup completed successfully." + + cleanup:port:forward: + desc: Clean up port forwarding processes + cmds: + - echo "Cleaning up port forwarding processes..." + - | + # Check if PID file exists + if [ -f "/tmp/mlflow-port-forward-main.pid" ]; then + PID=$(cat /tmp/mlflow-port-forward-main.pid) + + # Check if process is running and kill it + if [ -n "$PID" ] && ps -p $PID > /dev/null 2>&1; then + echo "Terminating port forwarding process with PID: $PID" + kill $PID + if [ $? -eq 0 ]; then + echo "✅ Port forwarding process terminated successfully" + else + echo "❌ Failed to terminate port forwarding process" + fi + else + echo "No running port forwarding process found with PID: $PID" + fi + + # Remove PID file + rm -f /tmp/mlflow-port-forward-main.pid + echo "Removed PID file" + else + echo "No port forwarding PID file found" + fi - echo "Port forwarding setup completed." + # Clean up any log files + rm -f /tmp/port-forward-mlflow-*.log + + # Check for any lingering port-forward processes + echo "Checking for lingering port-forward processes on port {{.PORT}}..." + RUNNING_FORWARDS=$(ps aux | grep "port-forward.*:{{.PORT}}" | grep -v grep || true) + if [ -n "$RUNNING_FORWARDS" ]; then + echo "Found running port-forward processes:" + echo "$RUNNING_FORWARDS" + echo "Attempting to kill these processes..." + + # Extract PIDs and kill + ps aux | grep "port-forward.*:{{.PORT}}" | grep -v grep | awk '{print $2}' | xargs -r kill + echo "✅ Lingering port-forward processes terminated" + else + echo "No lingering port-forward processes found" + fi + + echo "✅ Port forwarding cleanup completed" # Local installation task (renamed from test:install:local) install:helm:local: @@ -747,6 +860,9 @@ tasks: echo "No custom values file provided. Using default values." 
fi + # Clean up any existing port forwarding first + task cleanup:port:forward || true + # Install infra chart from local directory echo "Installing infra chart from local directory..." helm upgrade --install infra {{.CHART_DIR}}/infra \ @@ -794,11 +910,11 @@ tasks: deps: [test:install:helm, run:tests:app] cmds: - echo "All tests completed successfully" + - task: cleanup:port:forward # Documentation generation tasks docs:helm:generate: desc: Generate Helm chart documentation from templates - deps: [add:repos:helm, update:deps:helm] cmds: - echo "Generating Helm chart documentation..." - | @@ -818,7 +934,6 @@ tasks: docs:helm:check: desc: Check if Helm chart documentation is up to date - deps: [add:repos:helm, update:deps:helm] cmds: - echo "Checking if Helm chart documentation is up to date..." - | @@ -868,136 +983,10 @@ tasks: else echo "✅ All chart documentation is up to date." fi - - docs:kots:summary: - desc: Generate a summary of KOTS manifest files for platform engineers - cmds: - - echo "Generating KOTS manifest summary in docs/KOTS_MANIFEST_GUIDE.md..." - - | - # Create docs directory if it doesn't exist - mkdir -p docs - - # Generate the summary document - cat > docs/KOTS_MANIFEST_GUIDE.md << 'EOF' - # MLflow KOTS Manifest Guide - - This document provides a technical overview of the Replicated KOTS manifests used to package and deliver the MLflow application. - - ## Overview - - The manifests in the `applications/mlflow/kots` directory define how MLflow is packaged, configured, and deployed using the Replicated platform. - - ## Key Files and Their Purpose - - ### kots-app.yaml - - Defines the core application properties: - - Application metadata (name, icon, description) - - Status informers for monitoring deployment health - - Port definitions for services - - Release notes handling - - ### kots-config.yaml - - Contains all user-configurable settings presented during installation: - - Database configuration (embedded PostgreSQL or external) - - S3 storage settings (embedded MinIO or external S3) - - Networking and ingress configuration - - Resource allocation settings - - Each configuration option includes: - - Type (string, boolean, password, etc.) - - Default values - - Validation rules - - Help text for users - - Dependencies/when conditions - - ### mlflow-chart.yaml - - A HelmChart custom resource that: - - References the MLflow Helm chart - - Maps configuration values from user inputs to Helm values - - Defines conditional logic for different deployment scenarios - - Uses templating functions to insert configuration values - - ### infra-chart.yaml - - Similar to mlflow-chart.yaml but for infrastructure components: - - Configures supporting services (databases, object storage) - - Typically deployed before the main application - - ### kots-preflight.yaml - - Defines preflight checks to validate the environment before installation: - - Kubernetes version compatibility - - Resource availability (CPU, memory) - - Namespace access permissions - - Storage class availability - - ### k8s-app.yaml - - Kubernetes Application custom resource for discovery and management. - - ### ec.yaml - - EntitlementSpec that controls: - - License entitlements and limits - - Feature flags based on license tier - - Usage restrictions - - ## Best Practices for Modifications - - When making changes to these files: - - 1. **Version Consistency**: Update both Helm chart versions and kots-chart references - 2. **Testing**: Test with both new installations and upgrades - 3. 
**Config Options**: When adding new config options: - - Provide meaningful default values - - Include clear help text - - Consider when dependencies for conditional display - 4. **Templates**: Use consistent templating patterns - 5. **Preflight Checks**: Update preflight checks when requirements change - - ## Common Tasks - - ### Adding a New Configuration Option - - 1. Add the option to `kots-config.yaml` with appropriate metadata - 2. Reference the value in `mlflow-chart.yaml` using template functions - 3. Update preflight checks if the option affects requirements - - ### Updating Helm Chart Versions - - 1. Update the chartVersion in both infra-chart.yaml and mlflow-chart.yaml - 2. Run `task update:versions:chart` to ensure consistency - 3. Test both installation and upgrade scenarios - - ### Adding Support for a New Feature - - 1. First, implement the feature in the Helm chart - 2. Add any new configuration options to kots-config.yaml - 3. Connect the configuration to the Helm values in the chart files - 4. Optionally add preflight checks for any new requirements - 5. Test with both embedded and external dependencies - - ## Troubleshooting - - Common issues when working with these files: - - - **Template Rendering Errors**: Check template syntax in .yaml files - - **Preflight Check Failures**: Ensure requirements are correctly specified - - **Configuration Mismatches**: Verify config option names match between files - - **Upgrade Issues**: Test upgrades from earlier versions - - For more detailed information, refer to the [Replicated KOTS documentation](https://docs.replicated.com/vendor/packaging-kots-apps). - EOF - - echo "✅ KOTS manifest summary generated in docs/KOTS_MANIFEST_GUIDE.md" - # Version extraction extract:version:chart: desc: Extract and print the MLflow chart version cmds: - | echo "{{.MLFLOW_VERSION}}" - silent: true \ No newline at end of file + silent: true diff --git a/applications/mlflow/charts/mlflow/Chart.lock b/applications/mlflow/charts/mlflow/Chart.lock index 62851592..59484d3e 100644 --- a/applications/mlflow/charts/mlflow/Chart.lock +++ b/applications/mlflow/charts/mlflow/Chart.lock @@ -1,6 +1,6 @@ dependencies: - name: replicated repository: oci://registry.replicated.com/library - version: 1.5.0 -digest: sha256:47a29e041d280e6e5db79c0dcf469b5c43cef2d780169fa7cd40e9b02e9b1fd5 -generated: "2025-04-07T10:50:18.860452-04:00" + version: 1.5.1 +digest: sha256:743ca58f2dbfd1408d98b10e27b95f55f5dff2cfc3020e14c707822a5d0f88e0 +generated: "2025-04-16T12:26:22.509901-04:00" From 078d303c9b5b8939d4a70b40ddbd38d81fa0d8b2 Mon Sep 17 00:00:00 2001 From: Diamon Wiggins Date: Wed, 16 Apr 2025 12:57:21 -0400 Subject: [PATCH 11/18] use requirements --- .github/workflows/mlflow-ci.yml | 2 ++ applications/mlflow/Taskfile.yml | 4 ++-- applications/mlflow/tests/requirements.txt | 6 ++++++ 3 files changed, 10 insertions(+), 2 deletions(-) create mode 100644 applications/mlflow/tests/requirements.txt diff --git a/.github/workflows/mlflow-ci.yml b/.github/workflows/mlflow-ci.yml index 782715c0..e836ef76 100644 --- a/.github/workflows/mlflow-ci.yml +++ b/.github/workflows/mlflow-ci.yml @@ -39,6 +39,8 @@ jobs: uses: actions/setup-python@v4 with: python-version: 3.12 + cache: 'pip' + cache-dependency-path: applications/mlflow/tests/requirements.txt - name: Install Task uses: arduino/setup-task@v1 diff --git a/applications/mlflow/Taskfile.yml b/applications/mlflow/Taskfile.yml index ad1277b0..d3e9fd2d 100644 --- a/applications/mlflow/Taskfile.yml +++ b/applications/mlflow/Taskfile.yml @@ 
-895,8 +895,8 @@ tasks: cmds: - echo "Running application tests against MLflow on localhost:{{.PORT}}..." - | - echo "Installing Python dependencies for tests..." - pip3 install setuptools mlflow==2.11.0 pandas>=2.0.0 scikit-learn>=1.3.0 requests>=2.31.0 urllib3>=2.0.0 + echo "Installing Python dependencies from requirements.txt..." + pip3 install -r {{.TESTS_DIR}}/requirements.txt echo "Running MLflow application tests" python {{.TESTS_DIR}}/mlflow_test.py localhost:{{.PORT}} \ diff --git a/applications/mlflow/tests/requirements.txt b/applications/mlflow/tests/requirements.txt new file mode 100644 index 00000000..abd963ab --- /dev/null +++ b/applications/mlflow/tests/requirements.txt @@ -0,0 +1,6 @@ +setuptools>=65.0.0 +mlflow==2.11.0 +pandas>=2.0.0 +scikit-learn>=1.3.0 +requests>=2.31.0 +urllib3>=2.0.0 \ No newline at end of file From d2da8ffbb8b3626ae3944cb7761cee58d81d2452 Mon Sep 17 00:00:00 2001 From: Diamon Wiggins Date: Wed, 16 Apr 2025 13:02:31 -0400 Subject: [PATCH 12/18] use REPLICATED_APP env var --- .github/workflows/mlflow-ci.yml | 1 + applications/mlflow/Taskfile.yml | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/mlflow-ci.yml b/.github/workflows/mlflow-ci.yml index e836ef76..ccf6fe82 100644 --- a/.github/workflows/mlflow-ci.yml +++ b/.github/workflows/mlflow-ci.yml @@ -20,6 +20,7 @@ on: env: APP_SLUG: diamon-mlflow + REPLICATED_APP: diamon-mlflow jobs: lint-and-template: diff --git a/applications/mlflow/Taskfile.yml b/applications/mlflow/Taskfile.yml index d3e9fd2d..eb92836f 100644 --- a/applications/mlflow/Taskfile.yml +++ b/applications/mlflow/Taskfile.yml @@ -33,7 +33,8 @@ vars: sh: helm show chart ./charts/infra | grep '^version:' | cut -d ' ' -f 2 # Release configuration - APP_NAME: diamon-mlflow + # APP_NAME can be overridden by setting REPLICATED_APP environment variable + APP_NAME: '{{.REPLICATED_APP | default "diamon-mlflow"}}' YAML_DIR: "./kots" # Default task shows help From aff5441485db5ccdda000f62d41accd0d617fa6e Mon Sep 17 00:00:00 2001 From: Diamon Wiggins Date: Wed, 16 Apr 2025 13:33:18 -0400 Subject: [PATCH 13/18] remove cache dependency path --- .github/workflows/mlflow-ci.yml | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/workflows/mlflow-ci.yml b/.github/workflows/mlflow-ci.yml index ccf6fe82..2751355f 100644 --- a/.github/workflows/mlflow-ci.yml +++ b/.github/workflows/mlflow-ci.yml @@ -37,11 +37,10 @@ jobs: version: v3.13.3 - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: - python-version: 3.12 + python-version: '3.13' cache: 'pip' - cache-dependency-path: applications/mlflow/tests/requirements.txt - name: Install Task uses: arduino/setup-task@v1 @@ -234,9 +233,10 @@ jobs: version: v3.13.3 - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: - python-version: 3.12 + python-version: '3.13' + cache: 'pip' - name: Install Task uses: arduino/setup-task@v1 @@ -382,9 +382,10 @@ jobs: fetch-depth: 0 - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: - python-version: 3.12 + python-version: '3.13.3' + cache: 'pip' - name: Install Task uses: arduino/setup-task@v1 From 6c88bb13ba045865a6657cb5935815c53f6fcc96 Mon Sep 17 00:00:00 2001 From: Diamon Wiggins Date: Wed, 16 Apr 2025 13:36:54 -0400 Subject: [PATCH 14/18] remove cache from pythong setup --- .github/workflows/mlflow-ci.yml | 3 --- 1 file changed, 3 deletions(-) diff --git 
a/.github/workflows/mlflow-ci.yml b/.github/workflows/mlflow-ci.yml index 2751355f..eda32d30 100644 --- a/.github/workflows/mlflow-ci.yml +++ b/.github/workflows/mlflow-ci.yml @@ -40,7 +40,6 @@ jobs: uses: actions/setup-python@v5 with: python-version: '3.13' - cache: 'pip' - name: Install Task uses: arduino/setup-task@v1 @@ -236,7 +235,6 @@ jobs: uses: actions/setup-python@v5 with: python-version: '3.13' - cache: 'pip' - name: Install Task uses: arduino/setup-task@v1 @@ -385,7 +383,6 @@ jobs: uses: actions/setup-python@v5 with: python-version: '3.13.3' - cache: 'pip' - name: Install Task uses: arduino/setup-task@v1 From a08b6d4fa349e2c18e67190e8acc100d92c9d239 Mon Sep 17 00:00:00 2001 From: Diamon Wiggins Date: Wed, 16 Apr 2025 14:25:56 -0400 Subject: [PATCH 15/18] revert using requirements --- .github/workflows/mlflow-ci.yml | 45 +++++++-- applications/mlflow/Taskfile.yml | 108 ++++++++++++++++++++- applications/mlflow/tests/requirements.txt | 9 +- 3 files changed, 149 insertions(+), 13 deletions(-) diff --git a/.github/workflows/mlflow-ci.yml b/.github/workflows/mlflow-ci.yml index eda32d30..166ec8c2 100644 --- a/.github/workflows/mlflow-ci.yml +++ b/.github/workflows/mlflow-ci.yml @@ -37,9 +37,11 @@ jobs: version: v3.13.3 - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v4 with: - python-version: '3.13' + python-version: 3.12 + cache: 'pip' + cache-dependency-path: applications/mlflow/tests/requirements.txt - name: Install Task uses: arduino/setup-task@v1 @@ -232,9 +234,11 @@ jobs: version: v3.13.3 - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v4 with: - python-version: '3.13' + python-version: 3.12 + cache: 'pip' + cache-dependency-path: applications/mlflow/tests/requirements.txt - name: Install Task uses: arduino/setup-task@v1 @@ -326,7 +330,18 @@ jobs: - name: Run Application Tests working-directory: applications/mlflow run: | - # Run task to test application + # Create Python virtual environment + python -m venv ./venv + source ./venv/bin/activate + + # Install dependencies with fallback to binary-only + python -m pip install --upgrade pip wheel setuptools + python -m pip install --no-cache-dir -r tests/requirements.txt || { + echo "Regular installation failed, trying with binary-only approach..." + python -m pip install --only-binary=:all: -r tests/requirements.txt + } + + # Run tests task run:tests:app env: KUBECONFIG: ${{ steps.create-cluster.outputs.cluster-kubeconfig }} @@ -380,9 +395,11 @@ jobs: fetch-depth: 0 - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v4 with: - python-version: '3.13.3' + python-version: 3.12 + cache: 'pip' + cache-dependency-path: applications/mlflow/tests/requirements.txt - name: Install Task uses: arduino/setup-task@v1 @@ -584,11 +601,21 @@ jobs: env: KUBECONFIG: ${{ steps.create-cluster.outputs.cluster-kubeconfig }} - # Application testing with our consolidated test file - name: Run Application Tests working-directory: applications/mlflow run: | - # Run task to test application + # Create Python virtual environment + python -m venv ./venv + source ./venv/bin/activate + + # Install dependencies with fallback to binary-only + python -m pip install --upgrade pip wheel setuptools + python -m pip install --no-cache-dir -r tests/requirements.txt || { + echo "Regular installation failed, trying with binary-only approach..." 
+ python -m pip install --only-binary=:all: -r tests/requirements.txt + } + + # Run tests task run:tests:app env: KUBECONFIG: ${{ steps.create-cluster.outputs.cluster-kubeconfig }} diff --git a/applications/mlflow/Taskfile.yml b/applications/mlflow/Taskfile.yml index eb92836f..7ac7b64e 100644 --- a/applications/mlflow/Taskfile.yml +++ b/applications/mlflow/Taskfile.yml @@ -896,8 +896,20 @@ tasks: cmds: - echo "Running application tests against MLflow on localhost:{{.PORT}}..." - | - echo "Installing Python dependencies from requirements.txt..." - pip3 install -r {{.TESTS_DIR}}/requirements.txt + # Check if running inside a virtual environment already + if [ -z "$VIRTUAL_ENV" ]; then + echo "Installing Python dependencies directly..." + # Try to use binary wheels whenever possible + python -m pip install --upgrade pip wheel setuptools + # Install the required packages directly + python -m pip install mlflow numpy pandas scikit-learn pytest requests || { + echo "❌ Failed to install dependencies. Trying with --only-binary approach..." + # Try installing packages that commonly have build issues with binary-only + python -m pip install --only-binary=numpy,pandas,pyarrow,scikit-learn mlflow numpy pandas scikit-learn pytest requests + } + else + echo "Running in virtual environment $VIRTUAL_ENV, skipping dependency installation" + fi echo "Running MLflow application tests" python {{.TESTS_DIR}}/mlflow_test.py localhost:{{.PORT}} \ @@ -913,6 +925,98 @@ tasks: - echo "All tests completed successfully" - task: cleanup:port:forward + # Alternative test task with venv + run:tests:app:venv: + desc: Run application tests using a virtual environment for better isolation + cmds: + - echo "Running application tests in a virtual environment..." + - | + # Set up test env directory + TEST_ENV_DIR="{{.TESTS_DIR}}/.venv" + + # Clean up any existing venv if requested + if [ "${CLEAN_VENV:-no}" = "yes" ]; then + echo "Cleaning up existing virtual environment..." + rm -rf "$TEST_ENV_DIR" + fi + + # Create virtual environment if it doesn't exist + if [ ! -d "$TEST_ENV_DIR" ]; then + echo "Setting up new virtual environment..." + python -m venv "$TEST_ENV_DIR" + fi + + # Determine the correct activation script based on shell + if [ -f "$TEST_ENV_DIR/bin/activate" ]; then + ACTIVATE_SCRIPT="$TEST_ENV_DIR/bin/activate" + elif [ -f "$TEST_ENV_DIR/Scripts/activate" ]; then + ACTIVATE_SCRIPT="$TEST_ENV_DIR/Scripts/activate" + else + echo "❌ Unable to find activation script for virtual environment" + exit 1 + fi + + # Create a temporary script to run in the activated environment + TMP_SCRIPT=$(mktemp) + cat > "$TMP_SCRIPT" << 'EOF' + set -e + echo "Using Python: $(which python)" + echo "Python version: $(python --version)" + echo "Upgrading pip, setuptools, and wheel..." + python -m pip install --upgrade pip setuptools wheel + + echo "Installing dependencies with retry..." + MAX_RETRIES=2 + RETRY_COUNT=0 + SUCCESS=false + + while [ $RETRY_COUNT -lt $MAX_RETRIES ] && [ "$SUCCESS" != "true" ]; do + RETRY_COUNT=$((RETRY_COUNT + 1)) + echo "Attempt $RETRY_COUNT/$MAX_RETRIES: Installing dependencies..." + + if python -m pip install --no-cache-dir -r ./tests/requirements.txt; then + SUCCESS=true + else + echo "Installation failed, trying with binary-only approach..." + if python -m pip install --only-binary=:all: -r ./tests/requirements.txt; then + SUCCESS=true + else + echo "Binary-only installation failed too." + if [ $RETRY_COUNT -lt $MAX_RETRIES ]; then + echo "Will retry in 5 seconds..." 
+ sleep 5 + fi + fi + fi + done + + if [ "$SUCCESS" != "true" ]; then + echo "❌ Failed to install dependencies after $MAX_RETRIES attempts" + exit 1 + fi + + echo "Running MLflow application tests" + python ./tests/mlflow_test.py localhost:5000 --protocol http --connection-timeout 180 --debug + EOF + + # Make the script executable + chmod +x "$TMP_SCRIPT" + + # Run the script within the activated environment + echo "Activating virtual environment and running tests..." + source "$ACTIVATE_SCRIPT" && bash "$TMP_SCRIPT" + + # Clean up + rm -f "$TMP_SCRIPT" + + # Task to clean the virtual environment + clean:venv: + desc: Clean up the Python virtual environment used for testing + cmds: + - echo "Cleaning up Python virtual environment..." + - rm -rf "{{.TESTS_DIR}}/.venv" + - echo "✅ Python virtual environment cleaned" + # Documentation generation tasks docs:helm:generate: desc: Generate Helm chart documentation from templates diff --git a/applications/mlflow/tests/requirements.txt b/applications/mlflow/tests/requirements.txt index abd963ab..f94fcea3 100644 --- a/applications/mlflow/tests/requirements.txt +++ b/applications/mlflow/tests/requirements.txt @@ -1,6 +1,11 @@ setuptools>=65.0.0 +wheel>=0.40.0 mlflow==2.11.0 -pandas>=2.0.0 -scikit-learn>=1.3.0 +# Pre-built wheels for problematic packages +numpy<2.0.0 +pandas<2.2.0 +scikit-learn<1.4.0 +# Pin pyarrow to a version with pre-built wheels +pyarrow==15.0.0 requests>=2.31.0 urllib3>=2.0.0 \ No newline at end of file From 1c96bb5086c63f9d1ec5cdb98b6b6960c5b51227 Mon Sep 17 00:00:00 2001 From: Diamon Wiggins Date: Wed, 16 Apr 2025 14:30:31 -0400 Subject: [PATCH 16/18] revert using requirements --- .github/workflows/mlflow-ci.yml | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/.github/workflows/mlflow-ci.yml b/.github/workflows/mlflow-ci.yml index 166ec8c2..42dff3e9 100644 --- a/.github/workflows/mlflow-ci.yml +++ b/.github/workflows/mlflow-ci.yml @@ -39,9 +39,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: 3.12 - cache: 'pip' - cache-dependency-path: applications/mlflow/tests/requirements.txt + python-version: 3.13 - name: Install Task uses: arduino/setup-task@v1 @@ -236,9 +234,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: 3.12 - cache: 'pip' - cache-dependency-path: applications/mlflow/tests/requirements.txt + python-version: 3.13 - name: Install Task uses: arduino/setup-task@v1 @@ -397,9 +393,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: 3.12 - cache: 'pip' - cache-dependency-path: applications/mlflow/tests/requirements.txt + python-version: 3.13 - name: Install Task uses: arduino/setup-task@v1 From 9342efe6a8d16c96579af10e4e5e1efbdf100020 Mon Sep 17 00:00:00 2001 From: Diamon Wiggins Date: Wed, 16 Apr 2025 14:53:52 -0400 Subject: [PATCH 17/18] fix dependency install --- applications/mlflow/Taskfile.yml | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/applications/mlflow/Taskfile.yml b/applications/mlflow/Taskfile.yml index 7ac7b64e..9195829e 100644 --- a/applications/mlflow/Taskfile.yml +++ b/applications/mlflow/Taskfile.yml @@ -901,11 +901,13 @@ tasks: echo "Installing Python dependencies directly..." # Try to use binary wheels whenever possible python -m pip install --upgrade pip wheel setuptools - # Install the required packages directly - python -m pip install mlflow numpy pandas scikit-learn pytest requests || { - echo "❌ Failed to install dependencies. 
Trying with --only-binary approach..." - # Try installing packages that commonly have build issues with binary-only - python -m pip install --only-binary=numpy,pandas,pyarrow,scikit-learn mlflow numpy pandas scikit-learn pytest requests + # First install pandas with only-binary to avoid compilation issues + python -m pip install --only-binary=pandas pandas + # Then install the rest of the packages + python -m pip install mlflow numpy scikit-learn pytest requests || { + echo "❌ Failed to install dependencies. Trying with --only-binary approach for all packages..." + # Try installing all packages that commonly have build issues with binary-only + python -m pip install --only-binary=:all: mlflow numpy pandas scikit-learn pytest requests } else echo "Running in virtual environment $VIRTUAL_ENV, skipping dependency installation" From d83a77d70405572ac803a6b2f679e1617615d9d4 Mon Sep 17 00:00:00 2001 From: Diamon Wiggins Date: Wed, 16 Apr 2025 15:12:21 -0400 Subject: [PATCH 18/18] fix dependency install --- .github/workflows/mlflow-ci.yml | 20 +++++++++----------- applications/mlflow/Taskfile.yml | 10 ++-------- 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/.github/workflows/mlflow-ci.yml b/.github/workflows/mlflow-ci.yml index 42dff3e9..fe23fc83 100644 --- a/.github/workflows/mlflow-ci.yml +++ b/.github/workflows/mlflow-ci.yml @@ -39,7 +39,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: 3.13 + python-version: 3.12 - name: Install Task uses: arduino/setup-task@v1 @@ -330,12 +330,11 @@ jobs: python -m venv ./venv source ./venv/bin/activate - # Install dependencies with fallback to binary-only + # Install dependencies directly python -m pip install --upgrade pip wheel setuptools - python -m pip install --no-cache-dir -r tests/requirements.txt || { - echo "Regular installation failed, trying with binary-only approach..." - python -m pip install --only-binary=:all: -r tests/requirements.txt - } + + # Install required packages + python -m pip install mlflow numpy pandas scikit-learn pytest requests # Run tests task run:tests:app @@ -602,12 +601,11 @@ jobs: python -m venv ./venv source ./venv/bin/activate - # Install dependencies with fallback to binary-only + # Install dependencies directly python -m pip install --upgrade pip wheel setuptools - python -m pip install --no-cache-dir -r tests/requirements.txt || { - echo "Regular installation failed, trying with binary-only approach..." - python -m pip install --only-binary=:all: -r tests/requirements.txt - } + + # Install required packages + python -m pip install mlflow numpy pandas scikit-learn pytest requests # Run tests task run:tests:app diff --git a/applications/mlflow/Taskfile.yml b/applications/mlflow/Taskfile.yml index 9195829e..45aef30a 100644 --- a/applications/mlflow/Taskfile.yml +++ b/applications/mlflow/Taskfile.yml @@ -901,14 +901,8 @@ tasks: echo "Installing Python dependencies directly..." # Try to use binary wheels whenever possible python -m pip install --upgrade pip wheel setuptools - # First install pandas with only-binary to avoid compilation issues - python -m pip install --only-binary=pandas pandas - # Then install the rest of the packages - python -m pip install mlflow numpy scikit-learn pytest requests || { - echo "❌ Failed to install dependencies. Trying with --only-binary approach for all packages..." 
- # Try installing all packages that commonly have build issues with binary-only - python -m pip install --only-binary=:all: mlflow numpy pandas scikit-learn pytest requests - } + # Install the required packages directly + python -m pip install mlflow numpy pandas scikit-learn pytest requests else echo "Running in virtual environment $VIRTUAL_ENV, skipping dependency installation" fi