marqo-ai · vicilliar · Nov 7, 2024 · Dec 6, 2024 · Dec 10, 2024 · Dec 10, 2024
diff --git a/.github/workflows/unit_test_200gb_CI.yml b/.github/workflows/unit_test_200gb_CI.yml
@@ -1,9 +1,36 @@
 name: unit_test_200gb_CI
+run-name: Unit Tests with ${{ inputs.number_of_shards || 1 }} shards and ${{ inputs.number_of_replicas || 0 }} replicas
 # runs unit tests on AMD64 machine
 
 on:
   workflow_call:
+    inputs:
+      number_of_shards:
+        type: number
+        description: 'Number of shards (content nodes per group in Vespa). Minimum of 1.'
+        required: true
+        default: 1
+
+      number_of_replicas:
+        type: number
+        description: 'Number of replicas (groups in Vespa minus 1). Minimum of 0.'
+        required: true
+        default: 0
+
   workflow_dispatch:
+    inputs:
+      number_of_shards:
+        type: number
+        description: 'Number of shards (content nodes per group in Vespa). Minimum of 1.'
+        required: true
+        default: 1
+
+      number_of_replicas:
+        type: number
+        description: 'Number of replicas (groups in Vespa minus 1). Minimum of 0.'
+        required: true
+        default: 0
+
   push:
     branches:
       - mainline
@@ -16,13 +43,45 @@ on:
       - releases/*
 
 concurrency:
-  group: unit-tests-${{ github.ref }}
+  group: unit-tests-${{ github.ref }}-${{ inputs.number_of_shards }}-${{ inputs.number_of_replicas }}
   cancel-in-progress: true
 
 permissions:
   contents: read
 
 jobs:
+  Determine-Vespa-Setup:
+    runs-on: ubuntu-latest
+    outputs:
+      VESPA_MULTINODE_SETUP: ${{ steps.set_var.outputs.VESPA_MULTINODE_SETUP }}
+      MULTINODE_TEST_ARGS: ${{ steps.set_var.outputs.MULTINODE_TEST_ARGS }}
+    steps:
+      - name: Determine VESPA_MULTINODE_SETUP
+        id: set_var
+        run: |
+          # Initialize as single-node (no extra pytest args)
+          echo "VESPA_MULTINODE_SETUP=false" >> $GITHUB_OUTPUT
+          echo "MULTINODE_TEST_ARGS=" >> $GITHUB_OUTPUT
+
+          # NOTE: do not gate this on github.event_name. In a reusable workflow the
+          # github context belongs to the caller, so event_name is the caller's event
+          # (e.g. pull_request_review) and is never "workflow_call".
+          # Events without inputs (push, pull_request) fall back to 1 shard, 0 replicas.
+          NUMBER_OF_SHARDS="${{ inputs.number_of_shards || 1 }}"
+          NUMBER_OF_REPLICAS="${{ inputs.number_of_replicas || 0 }}"
+
+          # Convert inputs to integers
+          NUMBER_OF_SHARDS_INT=$(echo "$NUMBER_OF_SHARDS" | awk '{print int($0)}')
+          NUMBER_OF_REPLICAS_INT=$(echo "$NUMBER_OF_REPLICAS" | awk '{print int($0)}')
+
+          # Multi-node is needed as soon as there is more than 1 shard or any replica
+          if [[ "$NUMBER_OF_SHARDS_INT" -gt 1 || "$NUMBER_OF_REPLICAS_INT" -gt 0 ]]; then
+            echo "Using multi-node Vespa setup. Shards are $NUMBER_OF_SHARDS_INT and replicas are $NUMBER_OF_REPLICAS_INT."
+            echo "VESPA_MULTINODE_SETUP=true" >> $GITHUB_OUTPUT
+            # If multinode vespa, ignore unrelated tests to save time
+            echo "MULTINODE_TEST_ARGS=--multinode --ignore=tests/core/index_management/test_index_management.py --ignore=tests/core/inference --ignore=tests/processing --ignore=tests/s2_inference" >> $GITHUB_OUTPUT
+          fi
+
   Check-Changes:
     runs-on: ubuntu-latest
     outputs:
@@ -41,7 +100,7 @@ jobs:
           set -x
 
           # Determine BASE_COMMIT and HEAD_COMMIT based on the event type
-          if [[ "${GITHUB_EVENT_NAME}" == "pull_request" ]]; then
+          if [[ "${GITHUB_EVENT_NAME}" == "pull_request"  || "${GITHUB_EVENT_NAME}" == "pull_request_review" ]]; then
             BASE_COMMIT=${{ github.event.pull_request.base.sha }}
             HEAD_COMMIT=${{ github.event.pull_request.head.sha }}
           elif [[ "${GITHUB_EVENT_NAME}" == "push" ]]; then
@@ -71,10 +130,11 @@ jobs:
           fi
 
   Start-Runner:
-    name: Start self-hosted EC2 runner
-    runs-on: ubuntu-latest
     needs:
+      - Determine-Vespa-Setup
       - Check-Changes
+    name: Start self-hosted EC2 runner
+    runs-on: ubuntu-latest
     if: ${{ needs.Check-Changes.outputs.doc_only == 'false' }} # Run only if there are non-documentation changes
     outputs:
       label: ${{ steps.start-ec2-runner.outputs.label }}
@@ -93,7 +153,8 @@ jobs:
           mode: start
           github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
           ec2-image-id: ${{ vars.MARQO_CPU_AMD64_TESTS_INSTANCE_AMI }}
-          ec2-instance-type: m6i.xlarge
+          # m6i.xlarge if single node vespa, but m6i.2xlarge if multinode vespa
+          ec2-instance-type: ${{ needs.Determine-Vespa-Setup.outputs.VESPA_MULTINODE_SETUP == 'true' && 'm6i.2xlarge' || 'm6i.xlarge' }}
           subnet-id: ${{ secrets.MARQO_WORKFLOW_TESTS_SUBNET_ID }}
           security-group-id: ${{ secrets.MARQO_WORKFLOW_TESTS_SECURITY_GROUP_ID }}
           aws-resource-tags: > # optional, requires additional permissions
@@ -111,9 +172,13 @@ jobs:
     needs:
       - Check-Changes # required to start the main job when the runner is ready
       - Start-Runner # required to get output from the start-runner job
+      - Determine-Vespa-Setup
     if: ${{ needs.Check-Changes.outputs.doc_only == 'false' }} # Run only if there are non-documentation changes
     runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
     environment: marqo-test-suite
+    env:
+      VESPA_MULTINODE_SETUP: ${{ needs.Determine-Vespa-Setup.outputs.VESPA_MULTINODE_SETUP }}
+      MULTINODE_TEST_ARGS: ${{ needs.Determine-Vespa-Setup.outputs.MULTINODE_TEST_ARGS }}
     steps:
       - name: Checkout marqo repo
         uses: actions/checkout@v3
@@ -177,10 +242,10 @@ jobs:
           export VESPA_CONFIG_URL=http://localhost:19071
           export VESPA_DOCUMENT_URL=http://localhost:8080
           export VESPA_QUERY_URL=http://localhost:8080
-
+          
           cd marqo/scripts/vespa_local
           set -x
-          python vespa_local.py start
+          START_SCRIPT_OUTPUT=$(python vespa_local.py start --Shards ${{ inputs.number_of_shards || 1 }} --Replicas ${{ inputs.number_of_replicas || 0 }})
           set +x
 
           echo "Waiting for Vespa to start"
@@ -190,22 +255,15 @@ jobs:
           done
           echo -e "\nDone waiting."
 
-          # Zip up schemas and services
+          # Zip up schemas, hosts, and services
+          # Deploy application with test schema, don't keep zip file
           sudo apt-get install zip -y
-          zip -r vespa_tester_app.zip services.xml schemas
-
-          # Deploy application with test schema
-          curl --header "Content-Type:application/zip" --data-binary @vespa_tester_app.zip http://localhost:19071/application/v2/tenant/default/prepareandactivate
-
-          # wait for vespa to start (document url):
-          timeout 10m bash -c 'until curl -f -X GET $VESPA_DOCUMENT_URL >/dev/null 2>&1; do echo "  Waiting for Vespa document API to be available..."; sleep 10; done;' || \
-            (echo "Vespa (Document URL) did not start in time" && exit 1)
-
-          echo "Vespa document API is available. Local Vespa setup complete."
-
-          # Delete the zip file
-          rm vespa_tester_app.zip
-          echo "Deleted vespa_tester_app.zip"
+          zip -r - . -x README.md .env .gitignore "*.yml" __init__.py | curl --header "Content-Type:application/zip" --data-binary @- http://localhost:19071/application/v2/tenant/default/prepareandactivate
+
+          # Check for config node convergence
+          timeout 10m bash -c 'until [ "$(curl -f -X GET $VESPA_CONFIG_URL/application/v2/tenant/default/application/default/environment/prod/region/default/instance/default/serviceconverge | jq -r ".converged")" = "true" ]; do echo "  Waiting for Vespa convergence to be true..."; sleep 10; done;' || \
+            (echo "Vespa did not converge in time" && exit 1)
+          echo "Vespa application has converged. Vespa setup complete!"
 
       - name: Run Unit Tests
         id: run_unit_tests
@@ -224,7 +282,7 @@ jobs:
           cd marqo
           export PYTHONPATH="./tests:./src:."
           set -o pipefail
-          pytest --ignore=tests/test_documentation.py --ignore=tests/compatibility_tests \
+          pytest ${{ env.MULTINODE_TEST_ARGS }} --ignore=tests/test_documentation.py --ignore=tests/compatibility_tests \
             --durations=100 --cov=src --cov-branch --cov-context=test --cov-fail-under=69 \
             --cov-report=html:cov_html --cov-report=xml:cov.xml --cov-report term:skip-covered \
             --md-report --md-report-flavor gfm --md-report-output pytest_result_summary.md \

diff --git a/.github/workflows/unit_tests_with_shards_and_replicas.yml b/.github/workflows/unit_tests_with_shards_and_replicas.yml
@@ -0,0 +1,52 @@
+# Runs unit tests on 4 cases:
+# 1. single node vespa
+# 2. multinode vespa: 1 shard, 1 replica
+# 3. multinode vespa: 2 shards, 0 replicas
+# 4. multinode vespa: 2 shards, 1 replica
+# Runs only once on PR approval
+
+name: Unit Tests with Shards and Replicas
+
+on:
+  workflow_dispatch:
+  pull_request_review:
+    types: [submitted]
+    branches:
+      - mainline
+      - 'releases/*'
+
+permissions:
+  contents: read
+
+jobs:
+  Unit-Tests-1-Shard-0-Replica:
+    uses: ./.github/workflows/unit_test_200gb_CI.yml
+    secrets: inherit
+    if: github.event_name == 'workflow_dispatch' || github.event.review.state == 'approved'
+    with:
+      number_of_shards: 1
+      number_of_replicas: 0
+
+  Unit-Tests-1-Shard-1-Replica:
+    uses: ./.github/workflows/unit_test_200gb_CI.yml
+    secrets: inherit
+    if: github.event_name == 'workflow_dispatch' || github.event.review.state == 'approved'
+    with:
+      number_of_shards: 1
+      number_of_replicas: 1
+
+  Unit-Tests-2-Shard-0-Replica:
+    uses: ./.github/workflows/unit_test_200gb_CI.yml
+    secrets: inherit
+    if: github.event_name == 'workflow_dispatch' || github.event.review.state == 'approved'
+    with:
+      number_of_shards: 2
+      number_of_replicas: 0
+
+  Unit-Tests-2-Shard-1-Replica:
+    uses: ./.github/workflows/unit_test_200gb_CI.yml
+    secrets: inherit
+    if: github.event_name == 'workflow_dispatch' || github.event.review.state == 'approved'
+    with:
+      number_of_shards: 2
+      number_of_replicas: 1
diff --git a/.gitignore b/.gitignore
@@ -149,5 +149,15 @@ dump.rdb
 
 .DS_Store
 
+# Local vespa artifacts
 # Tester app for unit tests
-scripts/vespa_local/vespa_tester_app.zip
+scripts/vespa_local/vespa_tester_app.zip
+
+# Dynamically generated files for multinode vespa
+scripts/vespa_local/docker-compose.yml
+scripts/vespa_local/services.xml
+scripts/vespa_local/hosts.xml
+
+scripts/vespa_local/multinode/docker-compose.yml
+scripts/vespa_local/multinode/services.xml
+scripts/vespa_local/multinode/hosts.xml
diff --git a/requirements.dev.txt b/requirements.dev.txt
@@ -4,4 +4,8 @@ pytest==8.3.4
 pytest-cov==6.0.0
 diff-cover==9.2.0
 pytest-md-report==0.6.2
-pytest-asyncio==0.23.8
+pytest-asyncio==0.23.8
+
+# For vespa_local setup
+docker==7.1.0
+PyYAML==6.0.2
diff --git a/scripts/__init__.py b/scripts/__init__.py
diff --git a/scripts/vespa_local/README.md b/scripts/vespa_local/README.md
@@ -0,0 +1,58 @@
+# Setting up Vespa locally
+When running Marqo or the unit test suite locally, a Vespa node or cluster needs to be running. To assist with this, 
+this directory comes with scripts to set up either a single node (1 container) or multinode-HA Vespa on your machine.
+
+### Set Vespa version
+- By default, this script will use Vespa 8.431.32, as defined in `vespa_local.py`. To change it, set the `VESPA_VERSION`
+environment variable to the desired version. For example:
+```commandline
+export VESPA_VERSION="latest"
+```
+## Single Node Vespa (default & recommended)
+- Runs 1 Vespa container on your machine. This serves as the config, api, and content node.
+- This is equivalent to running Vespa with 0 replicas and 1 shard.
+- Start with this command:
+```commandline
+python vespa_local.py start
+```
+- This will run the Vespa Docker container and then copy the `services.xml` file from the `singlenode/` directory to
+this directory. The file will be bundled into the Vespa application upon deployment.
+
+## Multi-node Vespa
+- Runs a Vespa cluster with the following nodes:
+  - 3 config nodes
+  - `m` content nodes, where `m` is `number_of_shards * (1 + number_of_replicas)`
+  - `n` API nodes, where `n` is `max(2, number_of_content_nodes)`
+- For example, with 2 shards and 1 replica, it will run 4 content nodes and 2 API nodes.
+- Start with this command:
+```commandline
+python vespa_local.py start --Shards 2 --Replicas 1
+```
+
+## Deployment
+- After starting the Vespa node(s), you can deploy the Vespa application with the files in this directory using:
+```commandline
+python vespa_local.py deploy-config
+```
+- For single node, you can check for readiness using:
+```commandline
+curl -s http://localhost:19071/state/v1/health
+```
+- For multi-node, the start script will output a list of URLs corresponding to the API and content nodes.
+You can curl each one to check for readiness.
+
+## Other Commands
+### Stop Vespa
+```commandline
+python vespa_local.py stop
+```
+### Restart Vespa
+```commandline
+python vespa_local.py restart
+```
+
+## Notes
+- When running other commands (stop, restart), the script checks for the presence of a container named
+`vespa` and assumes the setup is single node if it finds one. If not, it assumes the setup is multi-node.
+- For multi-node, expect config and API nodes to take ~1 GB of memory, while content nodes take ~500 MB each. Adjust your
+resource allotment accordingly.
diff --git a/scripts/vespa_local/__init__.py b/scripts/vespa_local/__init__.py
diff --git a/scripts/vespa_local/services.xml b/scripts/vespa_local/services.xml
@@ -1,5 +1,6 @@
 <?xml version="1.0" encoding="utf-8" ?>
 <!-- Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -->
+<!-- This is the services.xml for a single node Vespa setup -->
 <services version="1.0" xmlns:deploy="vespa" xmlns:preprocess="properties">
     <container id="default" version="1.0">
         <document-api/>

diff --git a/scripts/vespa_local/singlenode/services.xml b/scripts/vespa_local/singlenode/services.xml
@@ -0,0 +1,23 @@
+<?xml version="1.0" encoding="utf-8" ?>
+<!-- Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -->
+<!-- This is the services.xml for a single node Vespa setup -->
+<services version="1.0" xmlns:deploy="vespa" xmlns:preprocess="properties">
+    <container id="default" version="1.0">
+        <document-api/>
+        <search/>
+        <nodes>
+            <jvm options="-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5005" />
+            <node hostalias="node1"/>
+        </nodes>
+    </container>
+    <content id="content_default" version="1.0">
+        <redundancy>2</redundancy>
+        <documents>
+            <document type="test_vespa_client" mode="index"/>
+        </documents>
+        <nodes>
+            <node hostalias="node1" distribution-key="0"/>
+        </nodes>
+    </content>
+
+</services>