Skip to content

Commit 718cb72

Browse files
committed
[tmpnet] Enable support for deployment to kubernetes
1 parent 59a0074 commit 718cb72

28 files changed

+1002
-63
lines changed

.github/actions/run-monitored-tmpnet-cmd/action.yml

+15-1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@ inputs:
88
run_env:
99
description: 'a string containing env vars for the command e.g. "MY_VAR1=foo MY_VAR2=bar"'
1010
default: ''
11+
runtime:
12+
description: 'the tmpnet runtime being used'
13+
default: 'process'
1114
filter_by_owner:
1215
default: ''
1316
artifact_prefix:
@@ -84,7 +87,7 @@ runs:
8487
# easy way to compose custom actions for use by other repos
8588
# without running into versioning issues.
8689
- name: Upload tmpnet data
87-
if: always()
90+
if: always() && (inputs.runtime == 'process')
8891
uses: actions/upload-artifact@v4
8992
with:
9093
name: ${{ inputs.artifact_prefix }}-tmpnet-data
@@ -93,3 +96,14 @@ runs:
9396
~/.tmpnet/prometheus/prometheus.log
9497
~/.tmpnet/promtail/promtail.log
9598
if-no-files-found: error
99+
- name: Export kind logs
100+
if: always() && (inputs.runtime == 'kube')
101+
shell: bash
102+
run: kind export logs /tmp/kind-logs
103+
- name: Upload kind logs
104+
if: always() && (inputs.runtime == 'kube')
105+
uses: actions/upload-artifact@v4
106+
with:
107+
name: ${{ inputs.artifact_prefix }}-kind-logs
108+
path: /tmp/kind-logs
109+
if-no-files-found: error

.github/workflows/ci.yml

+15
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,21 @@ jobs:
7676
prometheus_password: ${{ secrets.PROMETHEUS_PASSWORD || '' }}
7777
loki_username: ${{ secrets.LOKI_ID || '' }}
7878
loki_password: ${{ secrets.LOKI_PASSWORD || '' }}
79+
e2e_kube:
80+
runs-on: ubuntu-latest
81+
steps:
82+
- uses: actions/checkout@v4
83+
- uses: ./.github/actions/setup-go-for-project
84+
- uses: ./.github/actions/run-monitored-tmpnet-cmd
85+
with:
86+
run: ./scripts/run_task.sh test-e2e-kube-ci
87+
runtime: kube
88+
artifact_prefix: e2e-kube
89+
filter_by_owner: avalanchego-e2e
90+
prometheus_username: ${{ secrets.PROMETHEUS_ID || '' }}
91+
prometheus_password: ${{ secrets.PROMETHEUS_PASSWORD || '' }}
92+
loki_username: ${{ secrets.LOKI_ID || '' }}
93+
loki_password: ${{ secrets.LOKI_PASSWORD || '' }}
7994
e2e_existing_network:
8095
runs-on: ubuntu-latest
8196
steps:

Taskfile.yml

+12
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,18 @@ tasks:
173173
- task: build-xsvm
174174
- cmd: bash -x ./scripts/tests.e2e.existing.sh {{.CLI_ARGS}}
175175

176+
test-e2e-kube:
177+
desc: Runs e2e tests against a network deployed to kube
178+
cmds:
179+
- cmd: bash -x ./scripts/tests.e2e.kube.sh {{.CLI_ARGS}}
180+
181+
test-e2e-kube-ci:
182+
desc: Runs e2e tests against a network deployed to kube [serially]
183+
env:
184+
E2E_SERIAL: 1
185+
cmds:
186+
- task: test-e2e-kube
187+
176188
# To use a different fuzz time, run `task test-fuzz FUZZTIME=[value in seconds]`.
177189
# A value of `-1` will run until it encounters a failing output.
178190

flake.lock

+5-5
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

flake.nix

+1
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242

4343
# Kube tools
4444
kubectl # Kubernetes CLI
45+
k9s # Kubernetes TUI
4546
kind # Kubernetes-in-Docker
4647
kubernetes-helm # Helm CLI (Kubernetes package manager)
4748
self.packages.${system}.kind-with-registry # Script installing kind configured with a local registry

scripts/build_antithesis_images.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -87,5 +87,5 @@ else
8787
"${AVALANCHE_PATH}/build/antithesis/xsvm" \
8888
"AVALANCHEGO_PATH=${AVALANCHE_PATH}/build/avalanchego AVAGO_PLUGIN_DIR=${AVALANCHE_PATH}/build/plugins"
8989

90-
build_antithesis_images_for_avalanchego "${TEST_SETUP}" "${IMAGE_PREFIX}" "${AVALANCHE_PATH}/vms/example/xsvm/Dockerfile"
90+
build_antithesis_images_for_avalanchego "${TEST_SETUP}" "${IMAGE_PREFIX}" "${AVALANCHE_PATH}/tests/antithesis/xsvm/Dockerfile.node"
9191
fi

scripts/build_xsvm_image.sh

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#!/usr/bin/env bash
2+
3+
set -euo pipefail
4+
5+
if ! [[ "$0" =~ scripts/build_xsvm_image.sh ]]; then
6+
echo "must be run from repository root"
7+
exit 255
8+
fi
9+
10+
# Directory above this script
11+
AVALANCHE_PATH=$( cd "$( dirname "${BASH_SOURCE[0]}" )"; cd .. && pwd )
12+
13+
# TODO(marun) This image name should be configurable
14+
DOCKER_IMAGE="localhost:5001/avalanchego"
15+
16+
# Build the avalancehgo node image
17+
FORCE_TAG_LATEST=1 SKIP_BUILD_RACE=1 DOCKER_IMAGE="${DOCKER_IMAGE}" ./scripts/build_image.sh
18+
19+
# TODO(marun) conditionally push the image to the registry
20+
GO_VERSION="$(go list -m -f '{{.GoVersion}}')"
21+
docker buildx build --build-arg GO_VERSION="${GO_VERSION}" --build-arg AVALANCHEGO_NODE_IMAGE="${DOCKER_IMAGE}" \
22+
--push -t "${DOCKER_IMAGE}-xsvm" -f "${AVALANCHE_PATH}/vms/example/xsvm/Dockerfile" .

scripts/tests.e2e.bootstrap_monitor.sh

+3-1
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ if ! [[ "$0" =~ scripts/tests.e2e.bootstrap_monitor.sh ]]; then
99
exit 255
1010
fi
1111

12+
export KUBECONFIG="$HOME/.kube/config"
13+
1214
./bin/tmpnetctl start-kind-cluster
1315

14-
KUBECONFIG="$HOME/.kube/config" ./bin/ginkgo -v ./tests/fixture/bootstrapmonitor/e2e
16+
./bin/ginkgo -v ./tests/fixture/bootstrapmonitor/e2e

scripts/tests.e2e.kube.sh

+26
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
#!/usr/bin/env bash
2+
3+
set -euo pipefail
4+
5+
# Run e2e tests against nodes deployed to a kind cluster.
6+
7+
# TODO(marun)
8+
# - Support testing against a remote cluster
9+
10+
if ! [[ "$0" =~ scripts/tests.e2e.kube.sh ]]; then
11+
echo "must be run from repository root"
12+
exit 255
13+
fi
14+
15+
export KUBECONFIG="${KUBECONFIG:-$HOME/.kube/config}"
16+
17+
./bin/tmpnetctl start-kind-cluster
18+
19+
if [[ -z "${SKIP_BUILD_IMAGE:-}" ]]; then
20+
bash -x ./scripts/build_xsvm_image.sh
21+
fi
22+
23+
# Avoid having the test suite start local collectors since collection
24+
# is only required from the nodes running in the kind cluster
25+
TMPNET_START_COLLECTORS='' E2E_SERIAL=1 PATH="${PWD}/bin:$PATH" \
26+
bash -x ./scripts/tests.e2e.sh --runtime=kube --kube-image=localhost:5001/avalanchego-xsvm

scripts/tests.e2e.sh

+11-4
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,15 @@ source ./scripts/constants.sh
2222

2323
# Ensure an absolute path to avoid dependency on the working directory
2424
# of script execution.
25-
AVALANCHEGO_PATH="$(realpath "${AVALANCHEGO_PATH:-./build/avalanchego}")"
26-
E2E_ARGS="--avalanchego-path=${AVALANCHEGO_PATH}"
25+
E2E_ARGS="${*:-}"
26+
if ! [[ "${E2E_ARGS}" =~ "--runtime=kube" ]]; then
27+
# If not running in kubernetes, use the local avalanchego binary
28+
AVALANCHEGO_PATH="$(realpath "${AVALANCHEGO_PATH:-./build/avalanchego}")"
29+
E2E_ARGS+=" --avalanchego-path=${AVALANCHEGO_PATH}"
30+
31+
# Enable subnet testing by building the xsvm binary
32+
./scripts/build_xsvm.sh
33+
fi
2734

2835
#################################
2936
# Determine ginkgo args
@@ -54,5 +61,5 @@ else
5461
fi
5562

5663
#################################
57-
# shellcheck disable=SC2086
58-
./bin/ginkgo ${GINKGO_ARGS} -v ./tests/e2e -- "${E2E_ARGS[@]}" "${@}"
64+
# shellcheck disable=SC2086,SC2068
65+
./bin/ginkgo ${GINKGO_ARGS} -v ./tests/e2e -- ${E2E_ARGS[@]}

tests/e2e/faultinjection/duplicate_node_id.go

+6-1
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,14 @@ var _ = ginkgo.Describe("Duplicate node handling", func() {
2525
ginkgo.It("should ensure that a given Node ID (i.e. staking keypair) can be used at most once on a network", func() {
2626
network := e2e.GetEnv(tc).GetNetwork()
2727

28+
if network.DefaultRuntimeConfig.Kube != nil {
29+
// Enabling this test for kube requires supporting a flexible name mapping
30+
ginkgo.Skip("This test is not supported on kube to avoid having to deviate from composing the statefulset name with the network uuid + nodeid")
31+
}
32+
2833
tc.By("creating new node")
2934
node1 := e2e.AddEphemeralNode(tc, network, tmpnet.NewEphemeralNode(tmpnet.FlagsMap{}))
30-
e2e.WaitForHealthy(tc, node1)
35+
require.NoError(node1.WaitForHealthy(tc.DefaultContext()))
3136

3237
tc.By("checking that the new node is connected to its peers")
3338
checkConnectedPeers(tc, network.Nodes, node1)

tests/e2e/p/l1.go

+2-5
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,7 @@ var _ = e2e.DescribePChain("[L1]", func() {
175175
subnetGenesisNode := e2e.AddEphemeralNode(tc, env.GetNetwork(), tmpnet.NewEphemeralNode(tmpnet.FlagsMap{
176176
config.TrackSubnetsKey: subnetID.String(),
177177
}))
178+
e2e.WaitForHealthy(tc, subnetGenesisNode)
178179

179180
genesisNodePoP, err := subnetGenesisNode.GetProofOfPossession()
180181
require.NoError(err)
@@ -351,18 +352,14 @@ var _ = e2e.DescribePChain("[L1]", func() {
351352
subnetRegisterNode := e2e.AddEphemeralNode(tc, env.GetNetwork(), tmpnet.NewEphemeralNode(tmpnet.FlagsMap{
352353
config.TrackSubnetsKey: subnetID.String(),
353354
}))
355+
e2e.WaitForHealthy(tc, subnetRegisterNode)
354356

355357
registerNodePoP, err := subnetRegisterNode.GetProofOfPossession()
356358
require.NoError(err)
357359

358360
registerNodePK, err := bls.PublicKeyFromCompressedBytes(registerNodePoP.PublicKey[:])
359361
require.NoError(err)
360362

361-
tc.By("ensuring the subnet nodes are healthy", func() {
362-
e2e.WaitForHealthy(tc, subnetGenesisNode)
363-
e2e.WaitForHealthy(tc, subnetRegisterNode)
364-
})
365-
366363
tc.By("creating the RegisterL1ValidatorMessage")
367364
expiry := uint64(time.Now().Add(expiryDelay).Unix()) // This message will expire in 5 minutes
368365
registerL1ValidatorMessage, err := warpmessage.NewRegisterL1Validator(

tests/fixture/bootstrapmonitor/e2e/e2e_test.go

+3-1
Original file line numberDiff line numberDiff line change
@@ -261,14 +261,16 @@ func buildImage(tc tests.TestContext, imageName string, forceNewHash bool, scrip
261261
require.NoError(err, "Image build failed: %s", output)
262262
}
263263

264-
func newNodeStatefulSet(name string, flags map[string]string) *appsv1.StatefulSet {
264+
func newNodeStatefulSet(name string, flags tmpnet.FlagsMap) *appsv1.StatefulSet {
265265
statefulSet := tmpnet.NewNodeStatefulSet(
266266
name,
267+
true, /* generateName */
267268
latestAvalanchegoImage,
268269
nodeContainerName,
269270
volumeName,
270271
volumeSize,
271272
nodeDataDir,
273+
nil,
272274
flags,
273275
)
274276

tests/fixture/e2e/env.go

+13-4
Original file line numberDiff line numberDiff line change
@@ -193,10 +193,20 @@ func NewTestEnvironment(tc tests.TestContext, flagVars *FlagVars, desiredNetwork
193193
"not enough pre-funded keys for the requested number of parallel test processes",
194194
)
195195

196-
uris := network.GetNodeURIs()
196+
// TODO(marun) Maybe this should be part of tmpnet/network.go?
197+
uris := make([]tmpnet.NodeURI, len(network.Nodes))
198+
for i, node := range network.Nodes {
199+
uri, cancel, err := node.GetLocalURI(tc.DefaultContext())
200+
require.NoError(err)
201+
tc.DeferCleanup(cancel)
202+
uris[i] = tmpnet.NodeURI{
203+
NodeID: node.NodeID,
204+
URI: uri,
205+
}
206+
}
197207
require.NotEmpty(uris, "network contains no nodes")
198208
tc.Log().Info("network nodes are available",
199-
zap.Any("uris", uris),
209+
zap.Any("nodeURIs", uris),
200210
)
201211

202212
return &TestEnvironment{
@@ -214,8 +224,7 @@ func (te *TestEnvironment) GetRandomNodeURI() tmpnet.NodeURI {
214224
r := rand.New(rand.NewSource(time.Now().Unix())) //#nosec G404
215225
nodeURI := te.URIs[r.Intn(len(te.URIs))]
216226
te.testContext.Log().Info("targeting random node",
217-
zap.Stringer("nodeID", nodeURI.NodeID),
218-
zap.String("uri", nodeURI.URI),
227+
zap.Any("nodeURI", nodeURI),
219228
)
220229
return nodeURI
221230
}

tests/fixture/e2e/flags.go

+5-1
Original file line numberDiff line numberDiff line change
@@ -83,12 +83,16 @@ func (v *FlagVars) NodeRuntimeConfig() (*tmpnet.NodeRuntimeConfig, error) {
8383
return v.startNetworkVars.GetNodeRuntimeConfig()
8484
}
8585

86+
// TODO(marun) Rename to StartLocalCollector
8687
func (v *FlagVars) StartCollectors() bool {
88+
// This only prompts the deployment of local collectors.
89+
// TODO(marun) Maybe differentiate between type of collector?
8790
return v.startCollectors
8891
}
8992

9093
func (v *FlagVars) CheckMonitoring() bool {
91-
return v.checkMonitoring
94+
// TODO(marun) Enable this check for kube in a subsequent PR
95+
return v.startNetworkVars.ProcessRuntimeConfigured() && v.checkMonitoring
9296
}
9397

9498
func (v *FlagVars) NetworkDir() string {

tests/fixture/tmpnet/README.md

+20
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,8 @@ the following non-test files:
6161
|:----------------------------|:---------------|:-----------------------------------------------------------------------|
6262
| flags/ | | Directory defining flags usable with both stdlib flags and spf13/pflag |
6363
| flags/common.go | | Defines type definitions common across other files |
64+
| flags/kube_config.go | | Defines flags configuring the kubeconfig of the kube runtime |
65+
| flags/kube_runtime.go | | Defines flags configuring the kube node runtime |
6466
| flags/process_runtime.go | | Defines flags configuring the process node runtime |
6567
| flags/runtime.go | | Defines flags configuring node runtime |
6668
| flags/start_network.go | | Defines flags configuring network start |
@@ -72,6 +74,7 @@ the following non-test files:
7274
| flagsmap.go | FlagsMap | Simplifies configuration of avalanchego flags |
7375
| genesis.go | | Creates test genesis |
7476
| kube.go | | Library for Kubernetes interaction |
77+
| kube_runtime.go | | Orchestrates nodes running in Kubernetes |
7578
| local_network.go | | Defines configuration for the default local network |
7679
| monitor_processes.go | | Enables collection of logs and metrics from local processes |
7780
| network.go | Network | Orchestrates and configures temporary networks |
@@ -293,6 +296,8 @@ The details required to configure a node's execution are written to
293296
runtime-specific details like the path of the avalanchego binary to
294297
start the node with.
295298

299+
TODO(marun) Separate process from kube-based network deployment
300+
296301
#### Flags
297302
[Top](#table-of-contents)
298303

@@ -311,6 +316,12 @@ The process details of a node are written by avalanchego to
311316
process, the URI of the node's API, and the address other nodes can
312317
use to bootstrap themselves (aka staking address).
313318

319+
## Kube-based networks
320+
321+
- `tmpnet` supports deploying nodes to kubernetes.
322+
- Each node will be deployed as a stateful set.
323+
- The naming convention for statefulsets will be [network uuid]-[first 8 characters of the node ID, excluding the `NodeID-` prefix]
324+
314325
## Monitoring
315326
[Top](#table-of-contents)
316327

@@ -484,3 +495,12 @@ github action with `filter_by_owner` set to the owner string for the
484495
shared network. This ensures that the link emitted by the annotation
485496
displays results for only the shared network of the job rather than
486497
mixing results from all the networks started for the job.
498+
499+
## Concurrent usage
500+
501+
The types (networks, nodes, etc) that tmpnet defines are not safe for
502+
concurrent usage. To avoid requiring an rpc daemon, tmpnet stores its
503+
data on disk. Data is shared via the filesystem, and new instances can
504+
be created for every usage. In addition, tmpnet isn't intended to be
505+
multi-user. This may not be optimal in terms of scalability, but the
506+
lack of a daemon makes tmpnet much simpler to work with.

tests/fixture/tmpnet/defaults.go

+3-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,9 @@ const (
2121
// seconds provides a buffer in case of any delay in processing.
2222
DefaultValidatorStartTimeDiff = executor.SyncBound + 20*time.Second
2323

24-
DefaultNetworkTimeout = 2 * time.Minute
24+
// TODO(marun) Vary this between kube and process-based nodes
25+
// since the timing will be different
26+
DefaultNetworkTimeout = 4 * time.Minute
2527

2628
// Minimum required to ensure connectivity-based health checks will pass
2729
DefaultNodeCount = 2

0 commit comments

Comments
 (0)