Skip to content

Commit

Permalink
ci: Enable collection of logs and metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
marun committed Mar 19, 2024
1 parent 6fefaaf commit c1602f3
Show file tree
Hide file tree
Showing 10 changed files with 329 additions and 9 deletions.
63 changes: 59 additions & 4 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@ on:
pull_request:

env:
tmpnet_data_path: ~/.tmpnet/networks/1000

grafana_url: https://grafana-experimental.avax-dev.network/d/kBQpRdWnk/avalanche-main-dashboard?orgId=1&refresh=10s&var-filter=is_ephemeral_node%7C%3D%7Cfalse&var-filter=gh_repo%7C%3D%7Cava-labs%2Fsubnet-evm&var-filter=gh_run_id%7C%3D%7C${{ github.run_id }}&var-filter=gh_run_attempt%7C%3D%7C${{ github.run_attempt }}
jobs:
lint_test:
name: Lint
Expand Down Expand Up @@ -125,15 +124,43 @@ jobs:
- name: Build Subnet-EVM Plugin Binary
shell: bash
run: ./scripts/build.sh /tmp/e2e-test/avalanchego/plugins/srEXiWaHuhNyGwPUi444Tu47ZEDwxTWrbQiuD7FmgSAQ6X7Dy
- name: Start prometheus
shell: bash
run: bash -x ./scripts/run_prometheus.sh
env:
PROMETHEUS_ID: ${{ secrets.PROMETHEUS_ID }}
PROMETHEUS_PASSWORD: ${{ secrets.PROMETHEUS_PASSWORD }}
- name: Start promtail
shell: bash
run: bash -x ./scripts/run_promtail.sh
env:
LOKI_ID: ${{ secrets.LOKI_ID }}
LOKI_PASSWORD: ${{ secrets.LOKI_PASSWORD }}
- name: Notify of metrics availability
shell: bash
run: .github/workflows/notify-metrics-availability.sh
env:
GRAFANA_URL: ${{ env.grafana_url }}
GH_JOB_ID: ${{ github.job }}
- name: Run Warp E2E Tests
shell: bash
run: AVALANCHEGO_BUILD_PATH=/tmp/e2e-test/avalanchego ./scripts/run_ginkgo_warp.sh
env:
GH_REPO: ${{ github.repository }}
GH_WORKFLOW: ${{ github.workflow }}
GH_RUN_ID: ${{ github.run_id }}
GH_RUN_NUMBER: ${{ github.run_number }}
GH_RUN_ATTEMPT: ${{ github.run_attempt }}
GH_JOB_ID: ${{ github.job }}
- name: Upload tmpnet network dir for warp testing
if: always()
uses: actions/upload-artifact@v4
with:
name: warp-tmpnet-data
path: ${{ env.tmpnet_data_path }}
path: |
~/.tmpnet/networks
~/.tmpnet/prometheus/prometheus.log
~/.tmpnet/promtail/promtail.log
if-no-files-found: error
e2e_load:
name: e2e load tests
Expand All @@ -154,15 +181,43 @@ jobs:
- name: Build Subnet-EVM Plugin Binary
shell: bash
run: ./scripts/build.sh /tmp/e2e-test/avalanchego/plugins/srEXiWaHuhNyGwPUi444Tu47ZEDwxTWrbQiuD7FmgSAQ6X7Dy
- name: Start prometheus
shell: bash
run: bash -x ./scripts/run_prometheus.sh
env:
PROMETHEUS_ID: ${{ secrets.PROMETHEUS_ID }}
PROMETHEUS_PASSWORD: ${{ secrets.PROMETHEUS_PASSWORD }}
- name: Start promtail
shell: bash
run: bash -x ./scripts/run_promtail.sh
env:
LOKI_ID: ${{ secrets.LOKI_ID }}
LOKI_PASSWORD: ${{ secrets.LOKI_PASSWORD }}
- name: Notify of metrics availability
shell: bash
run: .github/workflows/notify-metrics-availability.sh
env:
GRAFANA_URL: ${{ env.grafana_url }}
GH_JOB_ID: ${{ github.job }}
- name: Run E2E Load Tests
shell: bash
run: AVALANCHEGO_BUILD_PATH=/tmp/e2e-test/avalanchego ./scripts/run_ginkgo_load.sh
env:
GH_REPO: ${{ github.repository }}
GH_WORKFLOW: ${{ github.workflow }}
GH_RUN_ID: ${{ github.run_id }}
GH_RUN_NUMBER: ${{ github.run_number }}
GH_RUN_ATTEMPT: ${{ github.run_attempt }}
GH_JOB_ID: ${{ github.job }}
- name: Upload tmpnet network dir for load testing
if: always()
uses: actions/upload-artifact@v4
with:
name: load-tmpnet-data
path: ${{ env.tmpnet_data_path }}
path: |
~/.tmpnet/networks
~/.tmpnet/prometheus/prometheus.log
~/.tmpnet/promtail/promtail.log
if-no-files-found: error

build_image:
Expand Down
19 changes: 19 additions & 0 deletions .github/workflows/notify-metrics-availability.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/usr/bin/env bash

set -euo pipefail

# Timestamps are in seconds
from_timestamp="$(date '+%s')"
monitoring_period=900 # 15 minutes
to_timestamp="$((from_timestamp + monitoring_period))"

# Grafana expects microseconds, so pad timestamps with 3 zeros
metrics_url="${GRAFANA_URL}&var-filter=gh_job_id%7C%3D%7C${GH_JOB_ID}&from=${from_timestamp}000&to=${to_timestamp}000"

# Optionally ensure that the link displays metrics only for the shared
# network rather than mixing it with the results for private networks.
if [[ -n "${FILTER_BY_OWNER:-}" ]]; then
metrics_url="${metrics_url}&var-filter=network_owner%7C%3D%7C${FILTER_BY_OWNER}"
fi

echo "::notice links::metrics ${metrics_url}"
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ go 1.21

require (
github.com/VictoriaMetrics/fastcache v1.10.0
github.com/ava-labs/avalanchego v1.11.2
github.com/ava-labs/avalanchego v1.11.3-0.20240318185506-35c4149e089b
github.com/cespare/cp v0.1.0
github.com/davecgh/go-spew v1.1.1
github.com/deckarep/golang-set/v2 v2.1.0
Expand All @@ -15,7 +15,6 @@ require (
github.com/fsnotify/fsnotify v1.6.0
github.com/gballet/go-libpcsclite v0.0.0-20191108122812-4678299bea08
github.com/go-cmd/cmd v1.4.1
github.com/golang/protobuf v1.5.3
github.com/google/uuid v1.6.0
github.com/gorilla/rpc v1.2.0
github.com/gorilla/websocket v1.4.2
Expand Down Expand Up @@ -82,6 +81,7 @@ require (
github.com/go-stack/stack v1.8.1 // indirect
github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/golang/protobuf v1.5.3 // indirect
github.com/golang/snappy v0.0.5-0.20220116011046-fa5810519dcb // indirect
github.com/google/btree v1.1.2 // indirect
github.com/google/go-cmp v0.6.0 // indirect
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ github.com/ajg/form v1.5.1/go.mod h1:uL1WgH+h2mgNtvBq0339dVnzXdBETtL2LeUXaIv25UY
github.com/allegro/bigcache v1.2.1-0.20190218064605-e24eb225f156 h1:eMwmnE/GDgah4HI848JfFxHt+iPb26b4zyfspmqY0/8=
github.com/allegro/bigcache v1.2.1-0.20190218064605-e24eb225f156/go.mod h1:Cb/ax3seSYIx7SuZdm2G2xzfwmv3TPSk2ucNfQESPXM=
github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8=
github.com/ava-labs/avalanchego v1.11.2 h1:8iodZ+RjqpRwHdiXPPtvaNt72qravge7voGzw3yPRzg=
github.com/ava-labs/avalanchego v1.11.2/go.mod h1:oTVnF9idL57J4LM/6RByTmKhI4QvV6OCnF99ysyBljE=
github.com/ava-labs/avalanchego v1.11.3-0.20240318185506-35c4149e089b h1:1GnaPVt62+ONLpkU0gObC3x5EOw+dcbZEbjGXD9/4j0=
github.com/ava-labs/avalanchego v1.11.3-0.20240318185506-35c4149e089b/go.mod h1:Yhtr0gRX0QFBb1Y3WQtymp3vm2qIukQqbkBQ/BP6Bus=
github.com/ava-labs/coreth v0.13.2-0.20240304213436-8afbf2d68461 h1:SIwGF3eVEwmexLm7is/MvG7W5sbmpGXaUT6RfUPP3jw=
github.com/ava-labs/coreth v0.13.2-0.20240304213436-8afbf2d68461/go.mod h1:v24MTMbxFSvyM7YeQFyWiXjIzVo2+UVs7tgH7xrByew=
github.com/aymerick/raymond v2.0.3-0.20180322193309-b565731e1464+incompatible/go.mod h1:osfaiScAUVup+UC9Nfq76eWqDhXlp+4UYaA8uhTBO6g=
Expand Down
120 changes: 120 additions & 0 deletions scripts/run_prometheus.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
#!/usr/bin/env bash

set -euo pipefail

# Starts a prometheus instance in agent-mode, forwarding to a central
# instance. Intended to enable metrics collection from temporary networks running
# locally and in CI.
#
# The prometheus instance will remain running in the background and will forward
# metrics to the central instance for all tmpnet networks.
#
# To stop it:
#
# $ kill -9 `cat ~/.tmpnet/prometheus/run.pid` && rm ~/.tmpnet/prometheus/run.pid
#

# e.g.,
# PROMETHEUS_ID=<id> PROMETHEUS_PASSWORD=<password> ./scripts/run_prometheus.sh
if ! [[ "$0" =~ scripts/run_prometheus.sh ]]; then
echo "must be run from repository root"
exit 255
fi

PROMETHEUS_WORKING_DIR="${HOME}/.tmpnet/prometheus"
PIDFILE="${PROMETHEUS_WORKING_DIR}"/run.pid

# First check if an agent-mode prometheus is already running. A single instance can collect
# metrics from all local temporary networks.
if pgrep --pidfile="${PIDFILE}" -f 'prometheus.*enable-feature=agent' &> /dev/null; then
echo "prometheus is already running locally with --enable-feature=agent"
exit 0
fi

PROMETHEUS_URL="${PROMETHEUS_URL:-https://prometheus-experimental.avax-dev.network}"
if [[ -z "${PROMETHEUS_URL}" ]]; then
echo "Please provide a value for PROMETHEUS_URL"
exit 1
fi

PROMETHEUS_ID="${PROMETHEUS_ID:-}"
if [[ -z "${PROMETHEUS_ID}" ]]; then
echo "Please provide a value for PROMETHEUS_ID"
exit 1
fi

PROMETHEUS_PASSWORD="${PROMETHEUS_PASSWORD:-}"
if [[ -z "${PROMETHEUS_PASSWORD}" ]]; then
echo "Plase provide a value for PROMETHEUS_PASSWORD"
exit 1
fi

# This was the LTS version when this script was written. Probably not
# much reason to update it unless something breaks since the usage
# here is only to collect metrics from temporary networks.
VERSION="2.45.3"

# Ensure the prometheus command is locally available
CMD=prometheus
if ! command -v "${CMD}" &> /dev/null; then
# Try to use a local version
CMD="${PWD}/bin/prometheus"
if ! command -v "${CMD}" &> /dev/null; then
echo "prometheus not found, attempting to install..."

# Determine the arch
if which sw_vers &> /dev/null; then
echo "on macos, only amd64 binaries are available so rosetta is required on apple silicon machines."
echo "to avoid using rosetta, install via homebrew: brew install prometheus"
DIST=darwin
else
ARCH="$(uname -i)"
if [[ "${ARCH}" != "x86_64" ]]; then
echo "on linux, only amd64 binaries are available. manual installation of prometheus is required."
exit 1
else
DIST="linux"
fi
fi

# Install the specified release
PROMETHEUS_FILE="prometheus-${VERSION}.${DIST}-amd64"
URL="https://github.com/prometheus/prometheus/releases/download/v${VERSION}/${PROMETHEUS_FILE}.tar.gz"
curl -s -L "${URL}" | tar zxv -C /tmp > /dev/null
mkdir -p "$(dirname "${CMD}")"
cp /tmp/"${PROMETHEUS_FILE}/prometheus" "${CMD}"
fi
fi

# Configure prometheus
FILE_SD_PATH="${PROMETHEUS_WORKING_DIR}/file_sd_configs"
mkdir -p "${FILE_SD_PATH}"

echo "writing configuration..."
cat >"${PROMETHEUS_WORKING_DIR}"/prometheus.yaml <<EOL
# my global config
global:
# Make sure this value takes into account the network-shutdown-delay in tests/fixture/e2e/env.go
scrape_interval: 10s # Default is every 1 minute.
evaluation_interval: 10s # The default is every 1 minute.
scrape_timeout: 5s # The default is every 10s
scrape_configs:
- job_name: "avalanchego"
metrics_path: "/ext/metrics"
file_sd_configs:
- files:
- '${FILE_SD_PATH}/*.json'
remote_write:
- url: "${PROMETHEUS_URL}/api/v1/write"
basic_auth:
username: "${PROMETHEUS_ID}"
password: "${PROMETHEUS_PASSWORD}"
EOL

echo "starting prometheus..."
cd "${PROMETHEUS_WORKING_DIR}"
nohup "${CMD}" --config.file=prometheus.yaml --web.listen-address=localhost:0 --enable-feature=agent > prometheus.log 2>&1 &
echo $! > "${PIDFILE}"
echo "running with pid $(cat "${PIDFILE}")"
115 changes: 115 additions & 0 deletions scripts/run_promtail.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
#!/usr/bin/env bash

set -euo pipefail

# Starts a promtail instance to collect logs from temporary networks
# running locally and in CI.
#
# The promtail instance will remain running in the background and will forward
# logs to the central instance for all tmpnet networks.
#
# To stop it:
#
# $ kill -9 `cat ~/.tmpnet/promtail/run.pid` && rm ~/.tmpnet/promtail/run.pid
#

# e.g.,
# LOKI_ID=<id> LOKI_PASSWORD=<password> ./scripts/run_promtail.sh
if ! [[ "$0" =~ scripts/run_promtail.sh ]]; then
echo "must be run from repository root"
exit 255
fi

PROMTAIL_WORKING_DIR="${HOME}/.tmpnet/promtail"
PIDFILE="${PROMTAIL_WORKING_DIR}"/run.pid

# First check if promtail is already running. A single instance can
# collect logs from all local temporary networks.
if pgrep --pidfile="${PIDFILE}" &> /dev/null; then
echo "promtail is already running"
exit 0
fi

LOKI_URL="${LOKI_URL:-https://loki-experimental.avax-dev.network}"
if [[ -z "${LOKI_URL}" ]]; then
echo "Please provide a value for LOKI_URL"
exit 1
fi

LOKI_ID="${LOKI_ID:-}"
if [[ -z "${LOKI_ID}" ]]; then
echo "Please provide a value for LOKI_ID"
exit 1
fi

LOKI_PASSWORD="${LOKI_PASSWORD:-}"
if [[ -z "${LOKI_PASSWORD}" ]]; then
echo "Plase provide a value for LOKI_PASSWORD"
exit 1
fi

# Version as of this writing
VERSION="v2.9.5"

# Ensure the promtail command is locally available
CMD=promtail
if ! command -v "${CMD}" &> /dev/null; then
# Try to use a local version
CMD="${PWD}/bin/promtail"
if ! command -v "${CMD}" &> /dev/null; then
echo "promtail not found, attempting to install..."
# Determine the arch
if which sw_vers &> /dev/null; then
DIST="darwin-$(uname -m)"
else
ARCH="$(uname -i)"
if [[ "${ARCH}" == "aarch64" ]]; then
ARCH="arm64"
elif [[ "${ARCH}" == "x86_64" ]]; then
ARCH="amd64"
fi
DIST="linux-${ARCH}"
fi

# Install the specified release
PROMTAIL_FILE="promtail-${DIST}"
ZIP_PATH="/tmp/${PROMTAIL_FILE}.zip"
BIN_DIR="$(dirname "${CMD}")"
URL="https://github.com/grafana/loki/releases/download/${VERSION}/promtail-${DIST}.zip"
curl -L -o "${ZIP_PATH}" "${URL}"
unzip "${ZIP_PATH}" -d "${BIN_DIR}"
mv "${BIN_DIR}/${PROMTAIL_FILE}" "${CMD}"
fi
fi

# Configure promtail
FILE_SD_PATH="${PROMTAIL_WORKING_DIR}/file_sd_configs"
mkdir -p "${FILE_SD_PATH}"

echo "writing configuration..."
cat >"${PROMTAIL_WORKING_DIR}"/promtail.yaml <<EOL
server:
http_listen_port: 0
grpc_listen_port: 0
positions:
filename: "${PROMTAIL_WORKING_DIR}/positions.yaml"
client:
url: "${LOKI_URL}/api/prom/push"
basic_auth:
username: "${LOKI_ID}"
password: "${LOKI_PASSWORD}"
scrape_configs:
- job_name: "avalanchego"
file_sd_configs:
- files:
- '${FILE_SD_PATH}/*.json'
EOL

echo "starting promtail..."
cd "${PROMTAIL_WORKING_DIR}"
nohup "${CMD}" -config.file=promtail.yaml > promtail.log 2>&1 &
echo $! > "${PIDFILE}"
echo "running with pid $(cat "${PIDFILE}")"
Loading

0 comments on commit c1602f3

Please sign in to comment.