-
Notifications
You must be signed in to change notification settings - Fork 197
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Multinode-HA Vespa Setup for Local Testing #1071
base: mainline
Are you sure you want to change the base?
Changes from 37 commits
37074d3
d14c15a
22327df
9ef33c0
10a5182
9af624c
7ef0df3
7f7f7fe
32788d5
aaab2ea
ecf8f95
5567d8d
95af9bf
609dbc4
568db72
6a54d7d
cb28827
204a07d
fc0d936
ccb7247
2153979
47240d0
66aa53c
e07d9c8
4951dad
b40da81
c53eb5c
e158816
12800f8
036e0cd
05384c0
4c3b44a
4f49b62
9b4fb2e
21a3c38
a8cf14a
d8cd756
05f356e
b0c9546
86b4868
37afe2b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,36 @@ | ||
name: unit_test_200gb_CI | ||
run-name: Unit Tests with ${{ inputs.number_of_shards || 1 }} shards and ${{ inputs.number_of_replicas || 0 }} replicas | ||
# runs unit tests on AMD64 machine | ||
|
||
on: | ||
workflow_call: | ||
inputs: | ||
number_of_shards: | ||
type: number | ||
description: 'Number of shards (content nodes per group in Vespa). Minimum of 1.' | ||
required: true | ||
default: 1 | ||
|
||
number_of_replicas: | ||
type: number | ||
description: 'Number of replicas (groups in Vespa minus 1). Minimum of 0.' | ||
required: true | ||
default: 0 | ||
|
||
workflow_dispatch: | ||
inputs: | ||
number_of_shards: | ||
type: number | ||
description: 'Number of shards (content nodes per group in Vespa). Minimum of 1.' | ||
required: true | ||
default: 1 | ||
|
||
number_of_replicas: | ||
type: number | ||
description: 'Number of replicas (groups in Vespa minus 1). Minimum of 0.' | ||
required: true | ||
default: 0 | ||
|
||
push: | ||
branches: | ||
- mainline | ||
|
@@ -16,13 +43,45 @@ on: | |
- releases/* | ||
|
||
concurrency: | ||
group: unit-tests-${{ github.ref }} | ||
group: unit-tests-${{ github.ref }}-${{ inputs.number_of_shards }}-${{ inputs.number_of_replicas }} | ||
cancel-in-progress: true | ||
|
||
permissions: | ||
contents: read | ||
|
||
jobs: | ||
Determine-Vespa-Setup: | ||
runs-on: ubuntu-latest | ||
outputs: | ||
VESPA_MULTINODE_SETUP: ${{ steps.set_var.outputs.VESPA_MULTINODE_SETUP }} | ||
MULTINODE_TEST_ARGS: ${{ steps.set_var.outputs.MULTINODE_TEST_ARGS }} | ||
steps: | ||
- name: Determine VESPA_MULTINODE_SETUP | ||
id: set_var | ||
run: | | ||
# Initialize as false | ||
echo "VESPA_MULTINODE_SETUP=false" >> $GITHUB_OUTPUT | ||
echo "MULTINODE_TEST_ARGS=" >> $GITHUB_OUTPUT | ||
|
||
# Check if the event is workflow_call or workflow_dispatch | ||
if [[ "${{ github.event_name }}" == "workflow_call" || "${{ github.event_name }}" == "workflow_dispatch" ]]; then | ||
# Extract inputs safely, defaulting to 1 (for shards), 0 (for replicas) if not present | ||
NUMBER_OF_SHARDS="${{ inputs.number_of_shards || 1 }}" | ||
NUMBER_OF_REPLICAS="${{ inputs.number_of_replicas || 0 }}" | ||
|
||
# Convert inputs to integers | ||
NUMBER_OF_SHARDS_INT=$(echo "$NUMBER_OF_SHARDS" | awk '{print int($0)}') | ||
NUMBER_OF_REPLICAS_INT=$(echo "$NUMBER_OF_REPLICAS" | awk '{print int($0)}') | ||
|
||
# Evaluate the conditions | ||
if [[ "$NUMBER_OF_SHARDS_INT" -gt 1 || "$NUMBER_OF_REPLICAS_INT" -gt 0 ]]; then | ||
echo "Using multi-node Vespa setup. Shards are $NUMBER_OF_SHARDS_INT and replicas are $NUMBER_OF_REPLICAS_INT." | ||
echo "VESPA_MULTINODE_SETUP=true" >> $GITHUB_OUTPUT | ||
# If multinode vespa, ignore unrelated tests to save time | ||
echo "MULTINODE_TEST_ARGS=--multinode --ignore=tests/core/index_management/test_index_management.py --ignore=tests/core/inference --ignore=tests/processing --ignore=tests/s2_inference" >> $GITHUB_OUTPUT | ||
papa99do marked this conversation as resolved.
Show resolved
Hide resolved
|
||
fi | ||
fi | ||
|
||
Check-Changes: | ||
runs-on: ubuntu-latest | ||
outputs: | ||
|
@@ -71,10 +130,11 @@ jobs: | |
fi | ||
|
||
Start-Runner: | ||
name: Start self-hosted EC2 runner | ||
runs-on: ubuntu-latest | ||
needs: | ||
- Determine-Vespa-Setup | ||
- Check-Changes | ||
name: Start self-hosted EC2 runner | ||
runs-on: ubuntu-latest | ||
if: ${{ needs.Check-Changes.outputs.doc_only == 'false' }} # Run only if there are non-documentation changes | ||
outputs: | ||
label: ${{ steps.start-ec2-runner.outputs.label }} | ||
|
@@ -93,7 +153,8 @@ jobs: | |
mode: start | ||
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} | ||
ec2-image-id: ${{ vars.MARQO_CPU_AMD64_TESTS_INSTANCE_AMI }} | ||
ec2-instance-type: m6i.xlarge | ||
# m6i.xlarge if single node vespa, but m6i.2xlarge if multinode vespa | ||
ec2-instance-type: ${{ needs.Determine-Vespa-Setup.outputs.VESPA_MULTINODE_SETUP == 'true' && 'm6i.2xlarge' || 'm6i.xlarge' }} | ||
subnet-id: ${{ secrets.MARQO_WORKFLOW_TESTS_SUBNET_ID }} | ||
security-group-id: ${{ secrets.MARQO_WORKFLOW_TESTS_SECURITY_GROUP_ID }} | ||
aws-resource-tags: > # optional, requires additional permissions | ||
|
@@ -111,9 +172,13 @@ jobs: | |
needs: | ||
- Check-Changes # required to start the main job when the runner is ready | ||
- Start-Runner # required to get output from the start-runner job | ||
- Determine-Vespa-Setup | ||
if: ${{ needs.Check-Changes.outputs.doc_only == 'false' }} # Run only if there are non-documentation changes | ||
runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner | ||
environment: marqo-test-suite | ||
env: | ||
VESPA_MULTINODE_SETUP: ${{ needs.Determine-Vespa-Setup.outputs.VESPA_MULTINODE_SETUP }} | ||
MULTINODE_TEST_ARGS: ${{ needs.Determine-Vespa-Setup.outputs.MULTINODE_TEST_ARGS }} | ||
steps: | ||
- name: Checkout marqo repo | ||
uses: actions/checkout@v3 | ||
|
@@ -177,10 +242,10 @@ jobs: | |
export VESPA_CONFIG_URL=http://localhost:19071 | ||
export VESPA_DOCUMENT_URL=http://localhost:8080 | ||
export VESPA_QUERY_URL=http://localhost:8080 | ||
|
||
cd marqo/scripts/vespa_local | ||
set -x | ||
python vespa_local.py start | ||
START_SCRIPT_OUTPUT=$(python vespa_local.py start --Shards ${{ inputs.number_of_shards || 1 }} --Replicas ${{ inputs.number_of_replicas || 0 }}) | ||
set +x | ||
|
||
echo "Waiting for Vespa to start" | ||
|
@@ -190,22 +255,15 @@ jobs: | |
done | ||
echo -e "\nDone waiting." | ||
|
||
# Zip up schemas and services | ||
# Zip up schemas, hosts, and services | ||
# Deploy application with test schema, don't keep zip file | ||
sudo apt-get install zip -y | ||
zip -r vespa_tester_app.zip services.xml schemas | ||
|
||
# Deploy application with test schema | ||
curl --header "Content-Type:application/zip" --data-binary @vespa_tester_app.zip http://localhost:19071/application/v2/tenant/default/prepareandactivate | ||
|
||
# wait for vespa to start (document url): | ||
timeout 10m bash -c 'until curl -f -X GET $VESPA_DOCUMENT_URL >/dev/null 2>&1; do echo " Waiting for Vespa document API to be available..."; sleep 10; done;' || \ | ||
(echo "Vespa (Document URL) did not start in time" && exit 1) | ||
|
||
echo "Vespa document API is available. Local Vespa setup complete." | ||
|
||
# Delete the zip file | ||
rm vespa_tester_app.zip | ||
echo "Deleted vespa_tester_app.zip" | ||
zip -r - . -x README.md .env .gitignore "*.yml" __init__.py | curl --header "Content-Type:application/zip" --data-binary @- http://localhost:19071/application/v2/tenant/default/prepareandactivate | ||
|
||
# Check for config node convergence | ||
timeout 10m bash -c 'until [ "$(curl -f -X GET $VESPA_CONFIG_URL/application/v2/tenant/default/application/default/environment/prod/region/default/instance/default/serviceconverge | jq -r ".converged")" = "true" ]; do echo " Waiting for Vespa convergence to be true..."; sleep 10; done;' || \ | ||
(echo "Vespa did not converge in time" && exit 1) | ||
echo "Vespa application has converged. Vespa setup complete!" | ||
|
||
- name: Run Unit Tests | ||
id: run_unit_tests | ||
|
@@ -224,7 +282,7 @@ jobs: | |
cd marqo | ||
export PYTHONPATH="./tests:./src:." | ||
set -o pipefail | ||
pytest --ignore=tests/test_documentation.py --ignore=tests/compatibility_tests \ | ||
pytest ${{ env.MULTINODE_TEST_ARGS }} --ignore=tests/test_documentation.py --ignore=tests/compatibility_tests \ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It seems MULTINODE_TEST_ARGS is not passed in correctly (or maybe is not populated correctly in the first place?) Also, in the next line, we fail the build if There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is passed in correctly for multinode runs. Please check this 2 shard 1 replica run: https://github.com/marqo-ai/marqo/actions/runs/13106217962/job/36561470973#step:9:15 MULTINODE_TEST_ARGS will be empty string for 1 shard and 0 replicas. Maybe that's the one you saw. |
||
--durations=100 --cov=src --cov-branch --cov-context=test --cov-fail-under=69 \ | ||
--cov-report=html:cov_html --cov-report=xml:cov.xml --cov-report term:skip-covered \ | ||
--md-report --md-report-flavor gfm --md-report-output pytest_result_summary.md \ | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
# Runs unit tests on 4 cases: | ||
# 1. single node vespa | ||
# 2. multinode vespa: 1 shard, 1 replica | ||
# 3. multinode vespa: 2 shards, 0 replicas | ||
# 4. multinode vespa: 2 shards, 1 replica | ||
# Runs only once on PR approval | ||
|
||
name: Unit Tests with Shards and Replicas | ||
|
||
on: | ||
workflow_dispatch: | ||
pull_request_review: | ||
types: [submitted] | ||
branches: | ||
- mainline | ||
- 'releases/*' | ||
|
||
permissions: | ||
contents: read | ||
|
||
jobs: | ||
Unit-Tests-1-Shard-0-Replica: | ||
papa99do marked this conversation as resolved.
Show resolved
Hide resolved
|
||
uses: ./.github/workflows/unit_test_200gb_CI.yml | ||
secrets: inherit | ||
if: github.event_name == 'workflow_dispatch' || github.event.review.state == 'approved' | ||
with: | ||
number_of_shards: 1 | ||
number_of_replicas: 0 | ||
|
||
Unit-Tests-1-Shard-1-Replica: | ||
uses: ./.github/workflows/unit_test_200gb_CI.yml | ||
secrets: inherit | ||
if: github.event_name == 'workflow_dispatch' || github.event.review.state == 'approved' | ||
with: | ||
number_of_shards: 1 | ||
number_of_replicas: 1 | ||
|
||
Unit-Tests-2-Shard-0-Replica: | ||
uses: ./.github/workflows/unit_test_200gb_CI.yml | ||
secrets: inherit | ||
if: github.event_name == 'workflow_dispatch' || github.event.review.state == 'approved' | ||
with: | ||
number_of_shards: 2 | ||
number_of_replicas: 0 | ||
|
||
Unit-Tests-2-Shard-1-Replica: | ||
uses: ./.github/workflows/unit_test_200gb_CI.yml | ||
secrets: inherit | ||
if: github.event_name == 'workflow_dispatch' || github.event.review.state == 'approved' | ||
with: | ||
number_of_shards: 2 | ||
number_of_replicas: 1 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
# Setting up Vespa locally | ||
When running Marqo or the unit test suite locally, a Vespa node or cluster needs to be running. To assist with this, | ||
this directory comes with scripts to set up either a single node (1 container) or multinode-HA Vespa on your machine. | ||
|
||
### Set Vespa version | ||
- By default, this script will use Vespa 8.431.32, as defined in `vespa_local.py`. To change it, set the `VESPA_VERSION` | ||
variable to the desired version. For example: | ||
```commandline | ||
export VESPA_VERSION="latest" | ||
``` | ||
## Single Node Vespa (default & recommended) | ||
- Runs 1 Vespa container on your machine. This serves as the config, api, and content node. | ||
- This is equivalent to running Vespa with 0 replicas and 1 shard. | ||
- Start with this command: | ||
```commandline | ||
python vespa_local.py start | ||
``` | ||
- This will run the Vespa Docker container, then copy the `services.xml` file from the `singlenode/` directory to | ||
this directory. The copied file will be bundled into the Vespa application upon deployment. | ||
|
||
## Multi-node Vespa | ||
- Runs a Vespa cluster with the following nodes: | ||
- 3 config nodes | ||
- `m` content nodes, where `m` is `number_of_shards * (1 + number_of_replicas)` | ||
- `n` API nodes, where `n` is `max(2, number_of_content_nodes)` | ||
- For example, with 2 shards and 1 replica, it will run 4 content nodes and 2 API nodes. | ||
- Start with this command: | ||
```commandline | ||
python vespa_local.py start --Shards 2 --Replicas 1 | ||
``` | ||
|
||
## Deployment | ||
- After starting the Vespa node(s), you can deploy the Vespa application with the files in this directory using: | ||
```commandline | ||
python vespa_local.py deploy-config | ||
``` | ||
- For single node, you can check for readiness using: | ||
``` | ||
curl -s http://localhost:19071/state/v1/health | ||
``` | ||
- For multi-node, the start script will output a list of URLs corresponding to the API and content nodes. | ||
You can curl each one to check for readiness. | ||
|
||
## Other Commands | ||
### Stop Vespa | ||
```commandline | ||
python vespa_local.py stop | ||
``` | ||
### Restart Vespa | ||
```commandline | ||
python vespa_local.py restart | ||
``` | ||
|
||
## Notes | ||
- When running other commands in this script (stop, restart), it will check for the presence of a container named | ||
`vespa`, and will assume setup is single node if it finds one. If not, it will assume setup is multi-node. | ||
- For multi-node, expect config and API nodes to take ~1 GB of memory each, while content nodes take ~500 MB each. Adjust your | ||
resource allotment accordingly. |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
<?xml version="1.0" encoding="utf-8" ?> | ||
<!-- Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. --> | ||
<!-- This is the services.xml for a single node Vespa setup --> | ||
<services version="1.0" xmlns:deploy="vespa" xmlns:preprocess="properties"> | ||
<container id="default" version="1.0"> | ||
<document-api/> | ||
<search/> | ||
<nodes> | ||
<jvm options="-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5005" /> | ||
<node hostalias="node1"/> | ||
</nodes> | ||
</container> | ||
<content id="content_default" version="1.0"> | ||
<redundancy>2</redundancy> | ||
<documents> | ||
<document type="test_vespa_client" mode="index"/> | ||
</documents> | ||
<nodes> | ||
<node hostalias="node1" distribution-key="0"/> | ||
</nodes> | ||
</content> | ||
|
||
</services> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This step should be run after `Check-Changes`, and should be run only if Check-Changes reports non-documentation changes:
`if: ${{ needs.Check-Changes.outputs.doc_only == 'false' }} # Run only if there are non-documentation changes`