From 5b6b9e288bc90b5d9e63ac562fee25fbf9f14d31 Mon Sep 17 00:00:00 2001
From: Brent George
Date: Thu, 10 Oct 2024 10:17:55 -0400
Subject: [PATCH 1/5] add diagnostic tools for deployments

---
 diagnostics/README.md                   |  40 ++++++
 diagnostics/dg_log_parser.sh            | 154 ++++++++++++++++++++++++
 diagnostics/dg_validate_nvidia_setup.sh | 102 ++++++++++++++++
 3 files changed, 296 insertions(+)
 create mode 100644 diagnostics/README.md
 create mode 100755 diagnostics/dg_log_parser.sh
 create mode 100644 diagnostics/dg_validate_nvidia_setup.sh

diff --git a/diagnostics/README.md b/diagnostics/README.md
new file mode 100644
index 0000000..d580dab
--- /dev/null
+++ b/diagnostics/README.md
@@ -0,0 +1,40 @@
+# Diagnostics
+
+This directory contains a collection of tools and scripts designed to help validate, monitor, and troubleshoot the deployment of Deepgram's self-hosted product.
+
+## Usage
+
+For detailed usage instructions and features of each script, please refer to the header comments within the respective script files.
+
+## Contents
+### 1. [dg_validate_nvidia_setup.sh](./dg_validate_nvidia_setup.sh)
+
+This script verifies the GPU environment and container runtime setup for Deepgram self-hosted products running with Docker or Podman.
+
+### 2. [dg_log_parser.sh](./dg_log_parser.sh)
+This script analyzes log files from Deepgram self-hosted containers to identify common issues and provide troubleshooting suggestions.
+
+Collecting log files for analysis will vary depending on your container orchestrator:
+
+#### Docker
+```bash
+```bash
+docker ps # Note the container ID of the relevant Deepgram container
+docker logs <container_id> > dg_container.log 2>&1
+```
+#### Podman
+```bash
+```bash
+podman ps # Note the container ID of the relevant Deepgram container
+podman logs <container_id> > dg_container.log 2>&1
+```
+#### Kubernetes
+```bash
+```bash
+kubectl get pods -n <namespace> # Note the name of the Pod containing the relevant Deepgram container
+kubectl logs <pod_name> > dg_container.log 2>&1
+```
+
+## Getting Help
+
+See the [Getting Help section](../README.md#getting-help) of the repo README.
diff --git a/diagnostics/dg_log_parser.sh b/diagnostics/dg_log_parser.sh
new file mode 100755
index 0000000..f9652ba
--- /dev/null
+++ b/diagnostics/dg_log_parser.sh
@@ -0,0 +1,154 @@
+#!/bin/bash
+#
+# This script analyzes log files from Deepgram self-hosted containers to identify common
+# issues and provide troubleshooting suggestions.
+#
+# ## Usage
+# This script can analyze individual container logs by passing a single file as an argument.
+# Additionally, it can analyze logs from containers deployed in the same environment
+# by passing each log file as a separate argument. This can be useful for analyzing a
+# paired API and Engine container.
+#
+# ```
+# ./dg_log_parser.sh <logfile1> [logfile2] [logfile3] ...
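+#
+# # For example, analyzing a paired API and Engine deployment together
+# # (hypothetical log file names):
+# ./dg_log_parser.sh api.log engine.log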
+# ```
+#
+# ## Supported Containers
+# - API
+# - Engine
+# - License Proxy
+
+set -euo pipefail
+
+YELLOW='\033[0;33m'
+GREEN='\033[0;32m'
+RED='\033[0;31m'
+NC='\033[0m' # No Color
+
+usage() {
+    printf "Usage: %s <logfile1> [logfile2] [logfile3] ...\n" "$0"
+    exit 1
+}
+
+if [ $# -eq 0 ]; then
+    usage
+fi
+
+check_file_errors() {
+    local file="$1"
+    local error_found=false
+    local container_name="Deepgram"
+
+    if grep -q "stem::config:" "$file"; then
+        container_name="API"
+    elif grep -q "impeller::config:" "$file"; then
+        container_name="Engine"
+    elif grep -q "hermes::config:" "$file"; then
+        container_name="Hermes"
+    fi
+
+    if grep -q "Configuration file not found at .* Falling back to default/bundled configuration" "$file"; then
+        printf "%bWarning%b: Using default configuration for %s container.\n" "$YELLOW" "$NC" "$container_name"
+        printf "If you intended to specify your own configuration file, ensure it is being properly mounted to the container.\n"
+    fi
+
+    if grep -q "Missing license configuration" "$file"; then
+        printf "%bError%b: Missing API key for %s container.\n" "$RED" "$NC" "$container_name"
+        printf "Suggested fix: Ensure that the environment variable \`DEEPGRAM_API_KEY\` is set within the container (usually via your Compose file or Helm chart).\n"
+        error_found=true
+    fi
+
+    if grep -qE "^.*Aegis request to .* failed.*$" "$file"; then
+        local target_url
+        target_url=$(grep -oE "Aegis request to [^ ]+ failed" "$file" | head -n1 | cut -d' ' -f4)
+        printf "%bError%b: Connection issue detected for %s container. Unable to connect/authenticate with License Server via %s\n" \
+            "$RED" "$NC" "$container_name" "$target_url"
+
+        if grep -qE "^.*Aegis request to .* failed:.*dns error.*$" "$file"; then
+            printf "Suggested fix: Check DNS resolution for the target service.\n"
+        elif grep -qE "^.*Aegis request to .* failed.*401.*$" "$file"; then
+            printf "Suggested fix: Your API key is unauthorized. Check console.deepgram.com to ensure that your API key is active and has self-hosted access.\n"
+        elif grep -qE "^.*Aegis request to .* failed:.*(TimedOut|Connection refused).*$" "$file"; then
+            printf "Suggested fix: "
+            if [[ "$target_url" =~ ^.*license.deepgram.com.*$ ]]; then
+                printf "Verify egress traffic to license.deepgram.com is allow-listed by your firewall, and check network connectivity for your container.\n"
+            else
+                printf "Verify the License Proxy container is running and healthy.\n"
+            fi
+        fi
+
+        error_found=true
+    fi
+
+    if grep -q "impeller::config: Using devices: CPU" "$file"; then
+        printf "%bWarning%b: Engine container was unable to detect a GPU, and is running in CPU mode.\n" "$YELLOW" "$NC"
+        printf "CPU mode is significantly less efficient than using a GPU. If not intended, ensure all GPU setup steps have been completed from the Deepgram developer documentation.\n"
+        error_found=true
+    elif grep -q "half_precision=false" "$file"; then
+        printf "%bWarning%b: GPU not running in half precision mode. Inference efficiency will be significantly impacted with this setting disabled.\n" "$YELLOW" "$NC"
+        printf "Most modern GPUs support half precision, but auto-detection of this capability may not be working.\n"
+        error_found=true
+    fi
+
+    if grep -q "impeller::model_suppliers::autoload: Unable to read model search path" "$file"; then
+        printf "%bError%b: Invalid models directory for %s container.\n" "$RED" "$NC" "$container_name"
+        printf "Suggested fix: Ensure that your models are mounted properly to the container.\n"
+        error_found=true
+    fi
+
+    if grep -q "Failed to load model" "$file"; then
+        bad_models=$(grep -P ".*Failed to load model.*" "$file" | grep -oP 'path=\K[^}]*' | sort -u)
+        printf "%bWarning%b: Some models could not be loaded by the %s container.\n" "$YELLOW" "$NC" "$container_name"
+        printf "Suggested fix: Check each of the following files for corrupt downloads, and verify the model was delivered for the same project that issued your self-hosted API key.\n"
+        for model in $bad_models; do
+            printf " - %s\n" "$model"
+        done
+        error_found=true
+    fi
+
+    $error_found
+}
+
+analyze_logs() {
+    local log_files=("$@")
+    local error_found=false
+
+    # Check each file individually for errors
+    for file in "${log_files[@]}"; do
+        if check_file_errors "$file"; then
+            error_found=true
+        fi
+    done
+
+    local temp_error_file
+    temp_error_file=$(mktemp)
+    local engine_listening=false
+    echo "false" >"$temp_error_file"
+    sort -k1 --stable "${log_files[@]}" | while IFS= read -r line; do
+        if [[ $line =~ ^.*INFO\ impeller:\ Listening\ on\ http.*$ ]]; then
+            engine_listening=true
+        fi
+
+        if [[ "$engine_listening" = true ]] && [[ $line =~ ^.*WARN\ impeller_info:\ stem::utils::impeller_info_actor:\ Unable\ to\ get\ model\ info\ from\ Engine\ with\ any\ drivers.*$ ]]; then
+            printf "%bError%b: The API container was unable to connect to the Engine container, even after the Engine container successfully started.\n" "$RED" "$NC"
+            printf "Suggested fix: Check your composition files, api.toml, and engine.toml files to ensure networking between the containers is configured correctly.\n"
+            echo "true" >"$temp_error_file"
+            break
+        fi
+    done
+
+    if [[ $(cat "$temp_error_file") == "true" ]]; then
+        error_found=true
+    fi
+    rm "$temp_error_file"
+
+    if [ "$error_found" = false ]; then
+        printf "%bNo problems detected from provided log files.%b \
+If something is wrong with your deployment, there may be a different error that is not detected by this initial script. \
+Contact your Deepgram Account Representative for further assistance.\n" \
+            "$GREEN" "$NC"
+    fi
+
+}
+
+analyze_logs "$@"
diff --git a/diagnostics/dg_validate_nvidia_setup.sh b/diagnostics/dg_validate_nvidia_setup.sh
new file mode 100644
index 0000000..223db99
--- /dev/null
+++ b/diagnostics/dg_validate_nvidia_setup.sh
@@ -0,0 +1,102 @@
+#!/bin/bash
+#
+# This script verifies the GPU environment and container runtime setup for Deepgram self-hosted products running with Docker or Podman.
+# It performs a series of checks to ensure that your system is properly configured to run GPU-accelerated container workloads.
+#
+# This script supports Ubuntu (using dpkg) and RHEL-based distributions (using dnf).
+#
+# ## Usage
+# Run this script with root privileges:
+# ```
+# sudo ./dg_validate_nvidia_setup.sh
+# ```
+
+# Function to display error messages in red
+error() {
+    printf "\033[0;31m%s\033[0m\n" "$1"
+}
+
+# Function to display success messages in green
+success() {
+    printf "\033[0;32m%s\033[0m\n" "$1"
+}
+
+direct_to_documentation() {
+    doc_string=$'For details, see the Deepgram Self-Hosted documentation at:\n\t'"$1"
+    error "$doc_string"
+}
+
+# Detect the package manager (dpkg for Ubuntu, dnf for RHEL-based distros)
+if command -v dpkg &>/dev/null; then
+    package_manager="dpkg -s"
+elif command -v dnf &>/dev/null; then
+    package_manager="dnf list installed"
+else
+    error "Unsupported package manager. This script supports Ubuntu (dpkg) and RHEL-based distros (dnf)."
+    exit 1
+fi
+
+# Check if NVIDIA drivers are installed correctly
+if lsmod | grep -q nouveau; then
+    error "Issue: Nouveau drivers are installed instead of NVIDIA drivers."
+    error "Please install the correct NVIDIA drivers and blacklist the Nouveau drivers."
+    direct_to_documentation "https://developers.deepgram.com/docs/drivers-and-containerization-platforms#remove-nouveau-drivers"
+    exit 1
+elif ! nvidia-smi &>/dev/null; then
+    error "Issue: NVIDIA drivers are not installed correctly or are corrupt."
+    error "Please reinstall the NVIDIA drivers and ensure they are functioning properly."
+    direct_to_documentation "https://developers.deepgram.com/docs/drivers-and-containerization-platforms#install-nvidia-drivers"
+    exit 1
+else
+    success "NVIDIA drivers are installed correctly."
+fi
+
+# Check if NVIDIA driver version is compatible with most recent Deepgram self-hosted release
+MINIMUM_DRIVER_VERSION="530.30.02"
+nvidia_driver_version=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader)
+if [[ "$(printf '%s\n' "$nvidia_driver_version" "$MINIMUM_DRIVER_VERSION" | sort -V | head -n1)" != "$MINIMUM_DRIVER_VERSION" ]]; then
+    error "Issue: The installed NVIDIA driver version is not compatible with the most recent Deepgram self-hosted release."
+    error "Please install a driver on version $MINIMUM_DRIVER_VERSION+."
+    direct_to_documentation "https://developers.deepgram.com/docs/drivers-and-containerization-platforms#download-and-install-the-official-drivers"
+    exit 1
+else
+    success "NVIDIA driver version is compatible with the most recent Deepgram self-hosted release."
+fi
+
+# Check if NVIDIA container runtime is installed
+if ! $package_manager nvidia-container-toolkit &>/dev/null; then
+    error "Issue: NVIDIA container toolkit is not installed."
+    error "Please install the NVIDIA container toolkit to enable GPU support in containers."
+    direct_to_documentation "https://developers.deepgram.com/docs/drivers-and-containerization-platforms#install-the-nvidia-container-runtime"
+    exit 1
+else
+    success "NVIDIA container runtime is installed."
+fi
+
+if which docker &>/dev/null; then
+    # Check if NVIDIA container runtime is configured with Docker
+    if ! grep -q "nvidia" /etc/docker/daemon.json 2>/dev/null; then
+        error "Issue: NVIDIA container runtime is not configured with Docker."
+        error "Please run the **Configuration** step for the 'nvidia-container-runtime'."
+        direct_to_documentation "https://developers.deepgram.com/docs/drivers-and-containerization-platforms#docker-1"
+        exit 1
+    fi
+elif which podman &>/dev/null; then
+    # Check if NVIDIA container runtime is configured with CDI for Podman
+    CDI_SPEC_FILE="/etc/cdi/nvidia.yaml"
+
+    if [ ! -f "$CDI_SPEC_FILE" ] || [ ! -r "$CDI_SPEC_FILE" ] || [ ! -s "$CDI_SPEC_FILE" ]; then
+        error "Issue: NVIDIA container runtime is not configured with Podman."
+        error "Please run the **Configuration** step for the 'nvidia-container-runtime'."
+        direct_to_documentation "https://developers.deepgram.com/docs/drivers-and-containerization-platforms#podman-1"
+        exit 1
+    fi
+else
+    error "Did not detect 'docker' or 'podman' container engines."
+    error "This script currently only supports these two approaches."
+    direct_to_documentation "https://developers.deepgram.com/docs/drivers-and-containerization-platforms#install-container-engine"
+    exit 1
+fi
+success "NVIDIA container runtime is configured properly."
+
+success $'\nYour instance appears to be ready to run GPU container workloads, such as Deepgram self-hosted products.'

From c8473542f4ff8e10b9e8f69379830ac677f5d89b Mon Sep 17 00:00:00 2001
From: Brent George
Date: Thu, 10 Oct 2024 10:28:24 -0400
Subject: [PATCH 2/5] add ci check for bash scripts

---
 .github/workflows/ci.yml | 44 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 77cda7c..a3ced65 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -75,3 +75,47 @@ jobs:
             exit 1
           fi
         fi
+
+  diagnostic-scripts-check:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+      - name: Check for changes in diagnostics directory
+        id: check_changes
+        run: |
+          git diff --name-only origin/${{ github.base_ref }}..HEAD -- diagnostics/*.sh > changed_files.txt
+          if [ -s changed_files.txt ]; then
+            echo "changes_detected=true" >> $GITHUB_OUTPUT
+          else
+            echo "changes_detected=false" >> $GITHUB_OUTPUT
+          fi
+      - name: Install ShellCheck
+        if: steps.check_changes.outputs.changes_detected == 'true'
+        run: sudo apt-get install -y shellcheck
+      - name: Install shfmt
+        if: steps.check_changes.outputs.changes_detected == 'true'
+        run: |
+          go install mvdan.cc/sh/v3/cmd/shfmt@latest
+          echo "$HOME/go/bin" >> $GITHUB_PATH
+      - name: Run ShellCheck
+        if: steps.check_changes.outputs.changes_detected == 'true'
+        run: |
+          while IFS= read -r file; do
+            if [[ "$file" == *.sh ]]; then
+              shellcheck "$file"
+            fi
+          done < changed_files.txt
+      - name: Run shfmt
+        if: steps.check_changes.outputs.changes_detected == 'true'
+        run: |
+          while IFS= read -r file; do
+            if [[ "$file" == *.sh ]]; then
+              if ! shfmt -d "$file"; then
+                echo "Error: $file is not correctly formatted. Run 'shfmt -w $file' to fix."
+                exit 1
+              fi
+            fi
+          done < changed_files.txt

From dc53c0feb0558fe2f91dd690032d66b7abde07ab Mon Sep 17 00:00:00 2001
From: Brent George
Date: Thu, 10 Oct 2024 10:30:33 -0400
Subject: [PATCH 3/5] update top-level README to include diagnostics dir

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 08668fa..809bcf9 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,7 @@ Official resources for running [Deepgram](https://deepgram.com) in a [self-hoste
 * [Helm Chart](charts/deepgram-self-hosted/README.md) for Kubernetes deployments
 * [Docker Compose Files](./docker/README.md) for deploying with Docker
 * [Podman Compose Files](./podman/README.md) for deploying with Podman
+* [Diagnostic](./diagnostics/README.md) tools and scripts for troubleshooting deployments
 
 ## Documentation

From e00bca61c181cf34ccac25455b332ac9d2798996 Mon Sep 17 00:00:00 2001
From: Brent George
Date: Thu, 10 Oct 2024 16:11:30 -0400
Subject: [PATCH 4/5] remove redundant shell openers

---
 diagnostics/README.md | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/diagnostics/README.md b/diagnostics/README.md
index d580dab..c2753f8 100644
--- a/diagnostics/README.md
+++ b/diagnostics/README.md
@@ -18,19 +18,16 @@ Collecting log files for analysis will vary depending on your container orchestr
 
 #### Docker
 ```bash
-```bash
 docker ps # Note the container ID of the relevant Deepgram container
 docker logs <container_id> > dg_container.log 2>&1
 ```
 #### Podman
 ```bash
-```bash
 podman ps # Note the container ID of the relevant Deepgram container
 podman logs <container_id> > dg_container.log 2>&1
 ```
 #### Kubernetes
 ```bash
-```bash
 kubectl get pods -n <namespace> # Note the name of the Pod containing the relevant Deepgram container
 kubectl logs <pod_name> > dg_container.log 2>&1
 ```

From 7764c5d263fe4879bc05c478517a9ac52e56dfe7 Mon Sep 17 00:00:00 2001
From: Brent George
Date: Thu, 10 Oct 2024 16:16:30 -0400
Subject: [PATCH 5/5] clarify log warnings

---
 diagnostics/dg_log_parser.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/diagnostics/dg_log_parser.sh b/diagnostics/dg_log_parser.sh
index f9652ba..b241db4 100755
--- a/diagnostics/dg_log_parser.sh
+++ b/diagnostics/dg_log_parser.sh
@@ -68,6 +68,7 @@ check_file_errors() {
             printf "Suggested fix: Check DNS resolution for the target service.\n"
         elif grep -qE "^.*Aegis request to .* failed.*401.*$" "$file"; then
             printf "Suggested fix: Your API key is unauthorized. Check console.deepgram.com to ensure that your API key is active and has self-hosted access.\n"
+            printf "See https://developers.deepgram.com/docs/self-hosted-self-service-tutorial for details.\n"
         elif grep -qE "^.*Aegis request to .* failed:.*(TimedOut|Connection refused).*$" "$file"; then
             printf "Suggested fix: "
             if [[ "$target_url" =~ ^.*license.deepgram.com.*$ ]]; then
@@ -82,7 +83,7 @@
     if grep -q "impeller::config: Using devices: CPU" "$file"; then
         printf "%bWarning%b: Engine container was unable to detect a GPU, and is running in CPU mode.\n" "$YELLOW" "$NC"
-        printf "CPU mode is significantly less efficient than using a GPU. If not intended, ensure all GPU setup steps have been completed from the Deepgram developer documentation.\n"
+        printf "CPU mode is critically less efficient than using a GPU, and likely not intended. Ensure all GPU setup steps have been completed from the Deepgram developer documentation.\n"
         error_found=true
     elif grep -q "half_precision=false" "$file"; then
         printf "%bWarning%b: GPU not running in half precision mode. Inference efficiency will be significantly impacted with this setting disabled.\n" "$YELLOW" "$NC"
         printf "Most modern GPUs support half precision, but auto-detection of this capability may not be working.\n"