Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add a health check endpoint #2670

Merged
merged 11 commits into from
Jan 16, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions .github/workflows/ci-lite.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -353,6 +353,58 @@ jobs:
-n 1 \
-m 'name_cache'

# Integration test for the health check endpoint: runs the image named by the
# meta job's container_base output as a service container and probes it over
# HTTP from a Python job container, addressing it by the service label `sc4s`.
test-healthcheck:
  runs-on: ubuntu-latest
  needs:
    - meta
    - build_action
  container:
    image: python:3.9-buster
  services:
    sc4s:
      image: ${{ needs.meta.outputs.container_base }}
      ports:
        - "8090:8090"
        - "514:514"
      env:
        SC4S_DEST_SPLUNK_HEC_DEFAULT_URL: https://splunk:8088
        SC4S_DEST_SPLUNK_HEC_DEFAULT_TOKEN: 00000000-0000-0000-0000-000000000000
        # Environment values are strings; they are quoted so that no YAML
        # loader can retype them (`yes` -> boolean, digits -> integer).
        SC4S_LISTEN_STATUS_PORT: "8090"  # the default is 8080
        HEALTHCHECK_CHECK_QUEUE_SIZE: "yes"
        # Must match the --limit passed to the queue-size test step below.
        HEALTHCHECK_MAX_QUEUE_SIZE: "10000"
  steps:
    - name: Checkout
      uses: actions/checkout@v4
      with:
        submodules: false
        persist-credentials: false
    - name: Install requests
      run: pip3 install requests
    - name: Return status 'healthy'
      run: python3 tests/test_healthcheck_healthy.py --host sc4s --port 8090
    - name: Return status 'queue size exceeded limit'
      run: python3 tests/test_healthcheck_queue_size_limit.py --limit 10000 --host sc4s --port 8090

# Unit tests for the health check module, executed with pytest through Poetry
# inside a plain Python container (no service containers are started).
test-healthcheck-unit-tests:
  runs-on: ubuntu-latest
  needs: [meta, build_action]
  container:
    image: python:3.9-buster
  steps:
    - name: Checkout
      uses: actions/checkout@v4
      with:
        submodules: false
        persist-credentials: false
    # Poetry installs the project's dependencies into its own environment.
    - name: Install dependencies
      run: |
        pip3 install poetry
        poetry install
    - name: Run tests
      run: poetry run pytest tests/test_healthcheck_unit_tests.py

release:
name: Release
runs-on: ubuntu-latest
Expand All @@ -362,6 +414,8 @@ jobs:
- test-container
- test-ipv4-name-cache
- test-ipv6-name-cache
- test-healthcheck
- test-healthcheck-unit-tests
steps:
- uses: actions/checkout@v4
with:
Expand Down
54 changes: 54 additions & 0 deletions .github/workflows/ci-main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,58 @@ jobs:
-n 1 \
-m 'name_cache'

# Integration test for the health check endpoint: runs the image named by the
# meta job's container_base output as a service container and probes it over
# HTTP from a Python job container, addressing it by the service label `sc4s`.
test-healthcheck:
  runs-on: ubuntu-latest
  needs:
    - meta
    - build_action
  container:
    image: python:3.9-buster
  services:
    sc4s:
      image: ${{ needs.meta.outputs.container_base }}
      ports:
        - "8090:8090"
        - "514:514"
      env:
        SC4S_DEST_SPLUNK_HEC_DEFAULT_URL: https://splunk:8088
        SC4S_DEST_SPLUNK_HEC_DEFAULT_TOKEN: 00000000-0000-0000-0000-000000000000
        # Environment values are strings; they are quoted so that no YAML
        # loader can retype them (`yes` -> boolean, digits -> integer).
        SC4S_LISTEN_STATUS_PORT: "8090"  # the default is 8080
        HEALTHCHECK_CHECK_QUEUE_SIZE: "yes"
        # Must match the --limit passed to the queue-size test step below.
        HEALTHCHECK_MAX_QUEUE_SIZE: "10000"
  steps:
    - name: Checkout
      uses: actions/checkout@v4
      with:
        submodules: false
        persist-credentials: false
    - name: Install requests
      run: pip3 install requests
    - name: Return status 'healthy'
      run: python3 tests/test_healthcheck_healthy.py --host sc4s --port 8090
    - name: Return status 'queue size exceeded limit'
      run: python3 tests/test_healthcheck_queue_size_limit.py --limit 10000 --host sc4s --port 8090

# Unit tests for the health check module, executed with pytest through Poetry
# inside a plain Python container (no service containers are started).
test-healthcheck-unit-tests:
  runs-on: ubuntu-latest
  needs: [meta, build_action]
  container:
    image: python:3.9-buster
  steps:
    - name: Checkout
      uses: actions/checkout@v4
      with:
        submodules: false
        persist-credentials: false
    # Poetry installs the project's dependencies into its own environment.
    - name: Install dependencies
      run: |
        pip3 install poetry
        poetry install
    - name: Run tests
      run: poetry run pytest tests/test_healthcheck_unit_tests.py

mike:
runs-on: ubuntu-latest
if: ${{ github.ref == 'refs/heads/main' }} || ${{ github.ref == 'refs/heads/develop' }}
Expand Down Expand Up @@ -387,6 +439,8 @@ jobs:
- test-container
- test-ipv4-name-cache
- test-ipv6-name-cache
- test-healthcheck
- test-healthcheck-unit-tests
- mike
steps:
- uses: actions/checkout@v4
Expand Down
3 changes: 3 additions & 0 deletions charts/splunk-connect-for-syslog/templates/statefulset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,9 @@ spec:
- name: ietf-dflt-tls
containerPort: 5425
protocol: TCP
- name: health
containerPort: 8080
protocol: TCP
{{- if .Values.sc4s }}
{{- if .Values.sc4s.vendor_product }}
{{- range $vp := .Values.sc4s.vendor_product }}
Expand Down
5 changes: 5 additions & 0 deletions docs/gettingstarted/ansible-docker-podman.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,11 @@ SC4S_ENV_CHECK_HEC: Splunk HEC connection test successful to index=main for sour
SC4S_ENV_CHECK_HEC: Splunk HEC connection test successful to index=main for sourcetype=sc4s:events...
syslog-ng checking config
sc4s version=v1.36.0
Configuring the health check port to: 8080
[2025-01-11 18:31:08 +0000] [135] [INFO] Starting gunicorn 23.0.0
[2025-01-11 18:31:08 +0000] [135] [INFO] Listening at: http://0.0.0.0:8080 (135)
[2025-01-11 18:31:08 +0000] [135] [INFO] Using worker: sync
[2025-01-11 18:31:08 +0000] [138] [INFO] Booting worker with pid: 138
starting syslog-ng
```

Expand Down
5 changes: 5 additions & 0 deletions docs/gettingstarted/ansible-docker-swarm.md
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,11 @@ SC4S_ENV_CHECK_HEC: Splunk HEC connection test successful to index=main for sour
SC4S_ENV_CHECK_HEC: Splunk HEC connection test successful to index=main for sourcetype=sc4s:events...
syslog-ng checking config
sc4s version=v1.36.0
Configuring the health check port to: 8080
[2025-01-11 18:31:08 +0000] [135] [INFO] Starting gunicorn 23.0.0
[2025-01-11 18:31:08 +0000] [135] [INFO] Listening at: http://0.0.0.0:8080 (135)
[2025-01-11 18:31:08 +0000] [135] [INFO] Using worker: sync
[2025-01-11 18:31:08 +0000] [138] [INFO] Booting worker with pid: 138
starting syslog-ng
```

Expand Down
5 changes: 5 additions & 0 deletions docs/gettingstarted/ansible-mk8s.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,5 +75,10 @@ SC4S_ENV_CHECK_HEC: Splunk HEC connection test successful to index=main for sour
SC4S_ENV_CHECK_HEC: Splunk HEC connection test successful to index=main for sourcetype=sc4s:events...
syslog-ng checking config
sc4s version=v1.36.0
Configuring the health check port to: 8080
[2025-01-11 18:31:08 +0000] [135] [INFO] Starting gunicorn 23.0.0
[2025-01-11 18:31:08 +0000] [135] [INFO] Listening at: http://0.0.0.0:8080 (135)
[2025-01-11 18:31:08 +0000] [135] [INFO] Using worker: sync
[2025-01-11 18:31:08 +0000] [138] [INFO] Booting worker with pid: 138
starting syslog-ng
```
5 changes: 5 additions & 0 deletions docs/gettingstarted/docker-compose-MacOS.md
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,11 @@ You should see events similar to those below in the output:
```ini
syslog-ng checking config
sc4s version=v1.36.0
Configuring the health check port to: 8080
[2025-01-11 18:31:08 +0000] [135] [INFO] Starting gunicorn 23.0.0
[2025-01-11 18:31:08 +0000] [135] [INFO] Listening at: http://0.0.0.0:8080 (135)
[2025-01-11 18:31:08 +0000] [135] [INFO] Using worker: sync
[2025-01-11 18:31:08 +0000] [138] [INFO] Booting worker with pid: 138
starting syslog-ng
```

Expand Down
5 changes: 5 additions & 0 deletions docs/gettingstarted/docker-compose.md
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,11 @@ You should see events similar to those below in the output:
```ini
syslog-ng checking config
sc4s version=v1.36.0
Configuring the health check port to: 8080
[2025-01-11 18:31:08 +0000] [135] [INFO] Starting gunicorn 23.0.0
[2025-01-11 18:31:08 +0000] [135] [INFO] Listening at: http://0.0.0.0:8080 (135)
[2025-01-11 18:31:08 +0000] [135] [INFO] Using worker: sync
[2025-01-11 18:31:08 +0000] [138] [INFO] Booting worker with pid: 138
starting syslog-ng
```

Expand Down
5 changes: 5 additions & 0 deletions docs/gettingstarted/docker-systemd-general.md
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,11 @@ You should see events similar to those below in the output:
```ini
syslog-ng checking config
sc4s version=v1.36.0
Configuring the health check port to: 8080
[2025-01-11 18:31:08 +0000] [135] [INFO] Starting gunicorn 23.0.0
[2025-01-11 18:31:08 +0000] [135] [INFO] Listening at: http://0.0.0.0:8080 (135)
[2025-01-11 18:31:08 +0000] [135] [INFO] Using worker: sync
[2025-01-11 18:31:08 +0000] [138] [INFO] Booting worker with pid: 138
starting syslog-ng
```

Expand Down
5 changes: 5 additions & 0 deletions docs/gettingstarted/podman-systemd-general.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,11 @@ You should see events similar to those below in the output:
```ini
syslog-ng checking config
sc4s version=v1.36.0
Configuring the health check port to: 8080
[2025-01-11 18:31:08 +0000] [135] [INFO] Starting gunicorn 23.0.0
[2025-01-11 18:31:08 +0000] [135] [INFO] Listening at: http://0.0.0.0:8080 (135)
[2025-01-11 18:31:08 +0000] [135] [INFO] Using worker: sync
[2025-01-11 18:31:08 +0000] [138] [INFO] Booting worker with pid: 138
starting syslog-ng
```

Expand Down
1 change: 1 addition & 0 deletions package/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ COPY package/etc/local_config /etc/syslog-ng/local_config
COPY package/etc/local_config /etc/syslog-ng/local_config
COPY package/sbin/entrypoint.sh /
COPY package/sbin/healthcheck.sh /
COPY package/sbin/healthcheck.py /
COPY package/sbin/source_ports_validator.py /

ENV SC4S_CONTAINER_OPTS=--no-caps
Expand Down
1 change: 1 addition & 0 deletions package/Dockerfile.lite
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ COPY package/lite/etc/addons /etc/syslog-ng/addons

COPY package/sbin/entrypoint.sh /
COPY package/sbin/healthcheck.sh /
COPY package/sbin/healthcheck.py /
COPY package/sbin/source_ports_validator.py /


Expand Down
3 changes: 3 additions & 0 deletions package/sbin/entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,9 @@ echo sc4s version=$(cat $SC4S_ETC/VERSION)
echo sc4s version=$(cat $SC4S_ETC/VERSION) >>$SC4S_VAR/log/syslog-ng.out
$SC4S_SBIN/syslog-ng --no-caps $SC4S_CONTAINER_OPTS -s >>$SC4S_VAR/log/syslog-ng.out 2>$SC4S_VAR/log/syslog-ng.err

# Start the HTTP health check server (healthcheck.py, served by gunicorn) in the
# background. Fall back to 8080 -- the same default healthcheck.py uses -- when
# SC4S_LISTEN_STATUS_PORT is unset or empty, so gunicorn never receives the
# invalid bind address "0.0.0.0:".
SC4S_LISTEN_STATUS_PORT="${SC4S_LISTEN_STATUS_PORT:-8080}"
echo "Configuring the health check port to: $SC4S_LISTEN_STATUS_PORT"
nohup gunicorn -b "0.0.0.0:${SC4S_LISTEN_STATUS_PORT}" healthcheck:app &

# OPTIONAL for BYOE: Comment out/remove all remaining lines and launch syslog-ng directly from systemd
if [ "${SC4S_DEBUG_CONTAINER}" == "yes" ]
then
Expand Down
113 changes: 113 additions & 0 deletions package/sbin/healthcheck.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
from flask import Flask, jsonify
import logging
import os
import subprocess

app = Flask(__name__)

def str_to_bool(value):
    """Interpret ``value`` as a boolean flag.

    The value is converted with ``str()``, stripped of surrounding whitespace
    and lower-cased. The result is ``True`` only when that text is one of
    ``'true'``, ``'1'``, ``'t'``, ``'y'`` or ``'yes'``; anything else
    (including ``'on'``, ``''`` and ``None``) yields ``False``.
    """
    normalized = str(value).strip().lower()
    return normalized in ('true', '1', 't', 'y', 'yes')

class Config:
    """Health check settings, read from the environment when the class body runs."""

    # URL of the default Splunk HEC destination; None when the variable is unset.
    SC4S_DEST_SPLUNK_HEC_DEFAULT_URL = os.environ.get('SC4S_DEST_SPLUNK_HEC_DEFAULT_URL')

    # Port passed to app.run() when this module is executed directly.
    HEALTHCHECK_PORT = int(os.environ.get('SC4S_LISTEN_STATUS_PORT', '8080'))

    # When true, /health also checks the syslog-ng queue size.
    CHECK_QUEUE_SIZE = str_to_bool(os.environ.get('HEALTHCHECK_CHECK_QUEUE_SIZE', "false"))

    # Largest queue size that is still reported as healthy.
    MAX_QUEUE_SIZE = int(os.environ.get('HEALTHCHECK_MAX_QUEUE_SIZE', '10000'))

# Configure logging for the health check. The format is a plain string: the
# %(...)s fields are filled in by the logging module, so no f-string is needed.
# No level is passed here, so the logging module's default level applies.
logging.basicConfig(
    format="%(asctime)s - healthcheck.py - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)
logger = logging.getLogger(__name__)

def check_syslog_ng_health() -> bool:
    """Report whether ``syslog-ng-ctl healthcheck`` succeeds.

    Runs ``syslog-ng-ctl healthcheck -t 1`` with a 5 second subprocess timeout.

    Returns:
        True only when the command exits with status 0. A non-zero exit, a
        timeout, or any other exception is logged and reported as False.
    """
    command = ['syslog-ng-ctl', 'healthcheck', '-t', '1']
    try:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=5
        )
        if completed.returncode != 0:
            logger.error(f"syslog-ng healthcheck failed: {completed.stderr.strip()}")
            return False
        return True
    except subprocess.TimeoutExpired:
        logger.error("syslog-ng healthcheck timed out.")
        return False
    except Exception as e:
        # Covers, for example, the syslog-ng-ctl binary being unavailable.
        logger.exception(f"Unexpected error during syslog-ng healthcheck: {e}")
        return False

def check_queue_size(
        sc4s_dest_splunk_hec_default=Config.SC4S_DEST_SPLUNK_HEC_DEFAULT_URL,
        max_queue_size=Config.MAX_QUEUE_SIZE
) -> bool:
    """Check that the syslog-ng queue for the default HEC destination is within the limit.

    Runs ``syslog-ng-ctl stats`` and takes the first output line that contains
    both ``;queued;`` and the destination URL. The text after that line's last
    ``;`` is parsed as the queue size.

    Args:
        sc4s_dest_splunk_hec_default: Destination URL to look for in the stats
            output. Defaults to the value captured in ``Config`` when this
            function was defined.
        max_queue_size: Largest queue size that still counts as healthy.

    Returns:
        True when a matching stats line is found and its queue size does not
        exceed ``max_queue_size``. False when the URL is not configured, the
        command fails or times out, no line matches, the size is over the
        limit, or any unexpected error occurs; each case is logged.
    """
    if not sc4s_dest_splunk_hec_default:
        logger.error(
            "SC4S_DEST_SPLUNK_HEC_DEFAULT_URL not configured. "
            "Ensure the default HEC destination is set, or disable HEALTHCHECK_CHECK_QUEUE_SIZE."
        )
        return False

    try:
        completed = subprocess.run(
            ['syslog-ng-ctl', 'stats'],
            capture_output=True,
            text=True,
            timeout=5
        )
        if completed.returncode != 0:
            logger.error(f"syslog-ng stats command failed: {completed.stderr.strip()}")
            return False

        # Only the first matching line is considered.
        matching_line = None
        for line in completed.stdout.splitlines():
            if ";queued;" in line and sc4s_dest_splunk_hec_default in line:
                matching_line = line
                break

        if not matching_line:
            logger.error("No matching queue stats found for the destination URL.")
            return False

        # The queue size is the last ';'-separated field of the stats line.
        queued = int(matching_line.rsplit(";", 1)[-1])
        if queued > max_queue_size:
            logger.warning(
                f"Queue size {queued} exceeds the maximum limit of {max_queue_size}."
            )
            return False

        return True
    except subprocess.TimeoutExpired:
        logger.error("syslog-ng stats command timed out.")
        return False
    except Exception as e:
        # Includes a non-numeric last field (ValueError from int()).
        logger.exception(f"Unexpected error checking queue size: {e}")
        return False

@app.route('/health', methods=['GET'])
def healthcheck():
    """Handle ``GET /health``.

    Returns a JSON body ``{"status": ...}`` with:
      * 503 when the syslog-ng health check fails;
      * 503 when ``Config.CHECK_QUEUE_SIZE`` is enabled and ``check_queue_size()``
        returns False for any reason (the status text always says the queue
        size exceeded the limit);
      * 200 and ``"healthy"`` otherwise.
    """
    # The syslog-ng check always runs first, whether or not the queue check is enabled.
    if not check_syslog_ng_health():
        return jsonify({'status': 'unhealthy: syslog-ng healthcheck failed'}), 503

    if Config.CHECK_QUEUE_SIZE and not check_queue_size():
        return jsonify({'status': 'unhealthy: queue size exceeded limit'}), 503

    logger.info("Service is healthy.")
    return jsonify({'status': 'healthy'}), 200


if __name__ == '__main__':
    # Direct execution only: serve the app with Flask's own server on all
    # interfaces, using the port read from the environment in Config.
    app.run(
        host='0.0.0.0',
        port=Config.HEALTHCHECK_PORT,
    )
Loading
Loading