# .github/workflows/largemodel_unit_test_CI.yml

name: largemodel_unit_test_CI
# Runs the unit test suite, including the large-model tests (pytest --largemodel),
# on a self-hosted CUDA EC2 runner that is started and stopped for each run.

on:
  # Callable from other workflows and runnable manually.
  workflow_call:
  workflow_dispatch:
  # Pushes to mainline / release branches; Markdown-only changes are ignored.
  push:
    branches: [mainline, "releases/*"]
    paths-ignore: ["**.md"]
  # Pull requests targeting mainline / release branches, same Markdown exclusion.
  pull_request:
    branches: [mainline, "releases/*"]
    paths-ignore: ["**.md"]

# One active run per git ref: a newer run in the same group cancels the one
# still in progress.
concurrency:
  cancel-in-progress: true
  group: "large-model-unit-tests-${{ github.ref }}"

# Restrict the workflow token to read-only access to repository contents.
permissions:
  contents: read

jobs:
  # Launches an EC2 instance and registers it as a self-hosted runner for this
  # run. The runner label and instance id are exported as job outputs so the
  # test job can target the runner and the stop job can shut it down.
  Start-Runner:
    name: Start self-hosted EC2 runner
    runs-on: ubuntu-latest
    outputs:
      label: ${{ steps.start-ec2-runner.outputs.label }}
      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.MARQO_WORKFLOW_TESTS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.MARQO_WORKFLOW_TESTS_SECRET_ACCESS_KEY }}
          aws-region: us-east-1
      - name: Start EC2 runner
        id: start-ec2-runner
        uses: machulav/ec2-github-runner@v2
        with:
          mode: start
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          # AMI and instance type used for the CUDA test machine.
          ec2-image-id: ${{ vars.MARQO_CUDA_TESTS_INSTANCE_AMI }}
          ec2-instance-type: g4dn.2xlarge
          subnet-id: ${{ secrets.MARQO_WORKFLOW_TESTS_SUBNET_ID }}
          security-group-id: ${{ secrets.MARQO_WORKFLOW_TESTS_SECURITY_GROUP_ID }}
          # Tags identify which repository / workflow run owns the instance.
          aws-resource-tags: > # optional, requires additional permissions
            [
              {"Key": "Name", "Value": "marqo-github-runner-${{ github.run_id }}"},
              {"Key": "GitHubRepo", "Value": "${{ github.repository }}"},
              {"Key": "WorkflowName", "Value": "${{ github.workflow }}"},
              {"Key": "WorkflowRunId", "Value": "${{ github.run_id }}"},
              {"Key": "WorkflowURL", "Value": "${{ github.event.repository.html_url }}/actions/runs/${{ github.run_id }}"},
              {"Key": "PoloRole", "Value": "testing"}
            ]

  # Runs the large-model unit tests on the runner created by Start-Runner.
  Test-Marqo:
    name: Run Large Model Unit Tests
    needs: Start-Runner # required to start the main job when the runner is ready
    # The needs context key is spelled exactly like the job id declared above.
    runs-on: ${{ needs.Start-Runner.outputs.label }} # run the job on the newly created runner
    environment: marqo-test-suite
    steps:
      # Full history is fetched (fetch-depth: 0), presumably so the later
      # diff-cover step can compare against the base branch.
      - name: Checkout marqo repo
        uses: actions/checkout@v3
        with:
          path: marqo
          fetch-depth: 0

      - name: Set up Python 3.9
        uses: actions/setup-python@v3
        with:
          cache: 'pip'
          python-version: '3.9'

      # marqo-base provides the GPU requirements file installed in the next step.
      - name: Checkout marqo-base for requirements
        uses: actions/checkout@v3
        with:
          path: marqo-base
          repository: marqo-ai/marqo-base

      - name: Install dependencies
        run: |
          # Base GPU requirements go in first; marqo's dev requirements are
          # installed afterwards so they can override the base ones if needed.
          pip install -r marqo-base/requirements/amd64-gpu-requirements.txt
          pip install -r marqo/requirements.dev.txt
          pip install pytest==7.4.0

      - name: Download nltk data
        run: python -m nltk.downloader punkt_tab

      - name: Build Vespa
        run: |
          # Stop and remove unattended-upgrades before apt is used below
          # (presumably to avoid contention on the package manager lock).
          systemctl stop unattended-upgrades
          apt-get remove -y unattended-upgrades

          echo "Updating package list"
          apt-get update -y

          # Toolchain for the Vespa components: JDK 17 and Maven.
          echo "Installing jdk 17"
          sudo apt-get install -y openjdk-17-jdk
          export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64
          export PATH="$JAVA_HOME/bin:$PATH"
          echo "Installing maven"
          sudo apt-get install -y maven

          # Package the Maven project under marqo/vespa.
          echo "Building Vespa components"
          cd marqo/vespa
          mvn clean package

      - name: Start Vespa
        run: |
          # Exported so the readiness probe further down, which runs in a
          # child bash process, can read VESPA_DOCUMENT_URL.
          export VESPA_CONFIG_URL=http://localhost:19071
          export VESPA_DOCUMENT_URL=http://localhost:8080
          export VESPA_QUERY_URL=http://localhost:8080

          cd marqo/scripts/vespa_local
          set -x
          python vespa_local.py start
          set +x

          # Fixed 20 second grace period before the application is deployed.
          echo "Waiting for Vespa to start"
          for ((i = 1; i <= 20; i++)); do
              echo -ne "Waiting... $i seconds\r"
              sleep 1
          done
          echo -e "\nDone waiting."

          # Bundle services.xml and the schemas directory into an application zip.
          sudo apt-get install -y zip
          zip -r vespa_tester_app.zip services.xml schemas

          # Deploy the test application through the config server.
          curl --header "Content-Type:application/zip" --data-binary @vespa_tester_app.zip \
            http://localhost:19071/application/v2/tenant/default/prepareandactivate

          # Poll the document URL every 10 seconds; give up after 10 minutes.
          if ! timeout 10m bash -c 'until curl -f -X GET $VESPA_DOCUMENT_URL >/dev/null 2>&1; do echo "  Waiting for Vespa document API to be available..."; sleep 10; done;'; then
            echo "Vespa (Document URL) did not start in time"
            exit 1
          fi

          echo "Vespa document API is available. Local Vespa setup complete."

          # The zip is only needed for the deployment above.
          rm vespa_tester_app.zip
          echo "Deleted vespa_tester_app.zip"

      # Runs pytest with --largemodel and coverage reporting. The step is
      # allowed to fail (continue-on-error) so the report steps still run; the
      # final step of the job turns a recorded failure into a job failure.
      - name: Run Large Model Unit Tests
        id: run_unit_tests
        continue-on-error: true
        env:
          # Vespa endpoints and model memory limits for use by marqo.
          VESPA_CONFIG_URL: "http://localhost:19071"
          VESPA_DOCUMENT_URL: "http://localhost:8080"
          VESPA_QUERY_URL: "http://localhost:8080"
          MARQO_MAX_CPU_MODEL_MEMORY: "15"
          MARQO_MAX_CUDA_MODEL_MEMORY: "15"
          # Secrets are handed over as environment variables instead of being
          # expanded into the script text, so their contents are never parsed
          # by the shell (no quoting breakage or injection from special characters).
          PRIVATE_MODEL_TESTS_AWS_ACCESS_KEY_ID: ${{ secrets.PRIVATE_MODEL_TESTS_AWS_ACCESS_KEY_ID }}
          PRIVATE_MODEL_TESTS_AWS_SECRET_ACCESS_KEY: ${{ secrets.PRIVATE_MODEL_TESTS_AWS_SECRET_ACCESS_KEY }}
          PRIVATE_MODEL_TESTS_HF_TOKEN: ${{ secrets.PRIVATE_MODEL_TESTS_HF_TOKEN }}
        run: |
          cd marqo
          export PYTHONPATH="./tests:./src:."
          # pipefail: the pipeline reports pytest's failure rather than tee's success.
          set -o pipefail
          pytest --largemodel --ignore=tests/test_documentation.py --ignore=tests/backwards_compatibility_tests \
            --durations=100 --cov=src --cov-branch --cov-context=test \
            --cov-report=html:cov_html --cov-report=xml:cov.xml --cov-report term:skip-covered \
            --md-report --md-report-flavor gfm --md-report-output pytest_result_summary.md \
            tests | tee pytest_output.txt

      # Runs diff-cover on cov.xml against the base branch: the pull request's
      # base branch for pull_request events, mainline otherwise. Allowed to fail
      # (continue-on-error); the final step of the job enforces the result.
      - name: Check Test Coverage of New Code
        id: check_test_coverage
        continue-on-error: true
        env:
          # Passed via the environment rather than expanded into the script text.
          PR_BASE_REF: ${{ github.event.pull_request.base.ref }}
        run: |
          if [[ "${GITHUB_EVENT_NAME}" == "pull_request" ]]; then
            BASE_BRANCH="${PR_BASE_REF}"
          else
            BASE_BRANCH=mainline
          fi

          cd marqo
          echo "Running diff-cover against branch $BASE_BRANCH"
          # Update the remote-tracking ref instead of a local branch:
          # `git fetch origin X:X` is refused by git when X is the branch that
          # is currently checked out (e.g. a push build of mainline).
          git fetch origin "+refs/heads/${BASE_BRANCH}:refs/remotes/origin/${BASE_BRANCH}"
          diff-cover cov.xml --html-report diff_cov.html --markdown-report diff_cov.md \
            --compare-branch "origin/${BASE_BRANCH}"

      # Publishes the HTML coverage report and the diff-cover HTML report as a
      # build artifact; an upload problem does not fail the job.
      - name: Upload Test Report
        continue-on-error: true
        uses: actions/upload-artifact@v4
        with:
          path: |
            marqo/cov_html/
            marqo/diff_cov.html
          name: marqo-test-report

      # Appends the pytest markdown report, the tail of the pytest output
      # (from the coverage banner onwards) and the diff-cover markdown report
      # to the job summary.
      - name: Test and coverage report summary
        continue-on-error: true
        run: |
          {
            echo "# Test Summary"
            cat marqo/pytest_result_summary.md

            echo "# Coverage Summary, Slow Tests and Failed Tests"
            echo '```text'
            # Print everything from the first coverage banner line to the end.
            awk '/---------- coverage:/ {flag=1} flag' marqo/pytest_output.txt
            echo '```'

            cat marqo/diff_cov.md
          } >> "$GITHUB_STEP_SUMMARY"

      # The test and coverage steps use continue-on-error so the reports above
      # are still produced; this step converts their recorded failure into a
      # failed job.
      - name: Fail Job If Tests or Coverage Failed
        if: steps.run_unit_tests.outcome == 'failure' || steps.check_test_coverage.outcome == 'failure'
        shell: bash
        run: |
          echo "Tests or coverage checks failed. Marking job as failed."
          exit 1

  # Shuts down the EC2 runner created by Start-Runner. Runs even when earlier
  # jobs failed, so the instance is not left behind.
  Stop-Runner:
    name: Stop self-hosted EC2 runner
    needs:
      - Start-Runner # required to get output from the start-runner job
      - Test-Marqo # required to wait when the main job is done
    runs-on: ubuntu-latest
    if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.MARQO_WORKFLOW_TESTS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.MARQO_WORKFLOW_TESTS_SECRET_ACCESS_KEY }}
          aws-region: us-east-1
      - name: Stop EC2 runner
        uses: machulav/ec2-github-runner@v2
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          # The needs context keys are spelled exactly like the job id in `needs`.
          label: ${{ needs.Start-Runner.outputs.label }}
          ec2-instance-id: ${{ needs.Start-Runner.outputs.ec2-instance-id }}