Support overriding the config.yaml with environment variables at runtime (e.g. in Docker) #57

Merged
27 changes: 22 additions & 5 deletions Dockerfile
@@ -37,17 +37,34 @@ RUN go build -ldflags="-s -w" -trimpath -o ./dist/manifold .
FROM debian:bullseye-slim

ENV JAEGER_ENDPOINT=http://0.0.0.0:16686

# Install necessary certificates
RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates && rm -rf /var/lib/apt/lists/*
ENV DEBIAN_FRONTEND=noninteractive

# Install necessary packages
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        bash \
        ca-certificates \
        curl \
        wget && \
    # Install yq for YAML processing
    # yq is used to process the config file
    wget -qO /usr/local/bin/yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 && \
    chmod +x /usr/local/bin/yq && \
    # Cleanup
    apt-get autoremove -y && \
    apt-get clean

WORKDIR /app

# Copy the built binary from stage 2
COPY --from=backend-builder /manifold/dist/manifold /app/

COPY config.yaml /app/
# Copy the tokenized config file and processor script
COPY config.yaml.example /app/
COPY process_config.sh /app/
RUN chmod +x /app/process_config.sh

EXPOSE 8080

CMD ["/app/manifold"]
# Process config and start the application
CMD ["/bin/bash", "-c", "/app/process_config.sh && /app/manifold"]
175 changes: 118 additions & 57 deletions config.yaml.example
@@ -1,86 +1,147 @@
# Manifold Example Configuration
# ===================================================================
# Manifold Configuration with Environment Variable Support
# ===================================================================
#
# This configuration file contains default values for Manifold.
# All settings can be overridden at runtime using environment variables.
#
# ENVIRONMENT VARIABLE MAPPING:
# ----------------------------
# Environment variables are automatically mapped to YAML paths using this convention:
#
# 1. Prefix all variables with "MANIFOLD__"
# 2. Use UPPERCASE for all letters
# 3. Use DOUBLE underscore (__) to separate YAML hierarchy levels
# 4. Use SINGLE underscore (_) for keys containing underscores
#
# Examples:
# MANIFOLD__HOST=api.example.com → host: 'api.example.com'
# MANIFOLD__PORT=9000 → port: 9000
# MANIFOLD__DATABASE__CONNECTION_STRING=... → database.connection_string: '...'
# MANIFOLD__MCPSERVERS__GITHUB__COMMAND=docker → mcpServers.github.command: 'docker'
# MANIFOLD__COMPLETIONS__DEFAULT_HOST=http://... → completions.default_host: 'http://...'
#
# VALUE TYPES:
# -----------
# Different value types are handled automatically, as shown below:
#
# - Numbers: MANIFOLD__PORT=8080 → port: 8080
# - Booleans: MANIFOLD__SINGLE_NODE_INSTANCE=false → single_node_instance: false
# - Strings: MANIFOLD__HOST=localhost → host: 'localhost'
# - Null: MANIFOLD__HF_TOKEN=null → hf_token: null
# - JSON arrays: MANIFOLD__MCPSERVERS__GITHUB__ARGS='["run","--rm"]'
#                → mcpServers.github.args: ["run","--rm"]
# - JSON objects: MANIFOLD__SOME__CONFIG='{"key":"value"}'
# → some.config: {"key":"value"}
#
# HOW IT WORKS:
# ------------
# At container startup, the process_config.sh script:
# 1. Copies this file to config.yaml
# 2. Finds all MANIFOLD__* environment variables
# 3. Maps them to their corresponding YAML paths
# 4. Updates the config.yaml file accordingly
#
# ===================================================================

# Manifold Host
# ===================================================================
# SERVER CONFIGURATION
# ===================================================================

# Server address and port
host: 'localhost'
port: 8080

# Manifold storage path: models, database files, etc
data_path: '/Users/yourusername/.manifold' # REPLACE with your actual path
# Storage path for models, database files, and other persistent data
data_path: '/data'

# ===================================================================
# RUNTIME CONFIGURATION
# ===================================================================

# Set to true to automatically run llama-server instances for embeddings, reranker, and completions
# This enables the http://localhost:32186/v1/chat/completions endpoint running the gemma-3-4b-it model
# When enabled, Manifold automatically runs llama-server instances for:
# - embeddings (port 32184)
# - reranker (port 32185)
# - completions (port 32186)
single_node_instance: true

# Database Configuration (PGVector)
# ===================================================================
# DATABASE CONFIGURATION
# ===================================================================

database:
  connection_string: "postgres://pgadmin:yourpassword@localhost:5432/manifold?sslmode=disable" # REPLACE with your actual credentials
  # PostgreSQL connection string with PGVector extension
  # Format: postgres://username:password@hostname:port/database?sslmode=disable
  connection_string: ""

# ===================================================================
# API TOKENS
# ===================================================================

# HuggingFace Token
hf_token: "..."
# HuggingFace API token for accessing gated models
hf_token: ""

# Google Gemini API token
google_gemini_key: "..."
google_gemini_key: ""

# Anthropic API token
anthropic_key: "..."
# Anthropic API token (Claude models)
anthropic_key: ""

# The completions, embeddings, and reranker services are automatically bootstrapped by Manifold on the ports defined below.
#
# - Completions Service: Handles the generation of text completions based on input prompts.
# - Embeddings Service: Manages the creation of vector representations for text data.
# - Reranker Service: Reorders search results to improve relevance based on certain criteria.
#
# Each service is configured with default settings, but can be customized as needed.
# Users can also set remote hosts for these services if they are running on different machines.
# ===================================================================
# LLM SERVICES CONFIGURATION
# ===================================================================

# Example CLI command for running the completions service manually:
# llama-server -m <data_path>/models/gguf/gemma-3-4b-it.Q8_0.gguf --temp 1.0 --ctx-size 16384 --min-p 0.01 --top-p 0.95 --top-k 64 --repeat-penalty 1.0 -t -1 -ngl 99 --parallel 4 --batch-size 2048 --ubatch-size 512 --threads-http 4 -fa --host 127.0.0.1 --port 32186 --props
#
# Default Completions Configuration - using local gemma-3-4b-it model
# Completions Service Configuration
# Handles generation of text completions based on input prompts
completions:
  default_host: "http://127.0.0.1:32186/v1/chat/completions" # or https://api.openai.com/v1/chat/completions
  completions_model: 'gpt-4o' # ignored if using local endpoint
  api_key: "" # Used with OpenAI API if configured as default host
  # OpenAI-compatible API endpoint
  default_host: "http://127.0.0.1:32186/v1/chat/completions"
  # Model identifier to use for completions
  completions_model: 'gpt-4o'
  # API key for the completions service (if required)
  api_key: ""

# Example CLI command for running the embeddings service manually:
# llama-server -m <data_path>/models/embeddings/nomic-embed-text-v1.5.Q8_0.gguf -c 65536 -np 8 -b 8192 -ub 8192 -fa --host 127.0.0.1 --port 32184 -lv 1 --embedding
#
# Embeddings API Configuration
# Using local nomic-embed-text-v1.5
# The initialize process will automatically download and start the model at port 32184
# Embeddings Service Configuration
# Manages the creation of vector representations for text data
embeddings:
  # OpenAI-compatible API endpoint
  host: "http://127.0.0.1:32184/v1/embeddings"
  # OpenAI API compatible API key, not required for local servers unless configured on that server
  # API key for the embeddings service (if required)
  api_key: ""
  dimensions: 768 # Size of embedding dimensions
  # Vector dimensions for the embedding model
  dimensions: 768
  # Prefix added to document text before embedding
  embed_prefix: "search_document: "
  # Prefix added to query text before embedding
  search_prefix: "search_query: "

# Example CLI command for running the reranker service manually:
# llama-server -m <data_path>/models/rerankers/slide-bge-reranker-v2-m3.Q4_K_M.gguf -c 65536 -np 8 -b 8192 -ub 8192 -fa --host 127.0.0.1 --port 32185 -lv 1 --reranking --pooling rank
#
# Reranker using local slide-bge-reranker-v2-m3
# The initialize process will automatically download and start the model at port 32185
# Reranker Service Configuration
# Reorders search results to improve relevance
reranker:
  # OpenAI-compatible API endpoint
  host: "http://127.0.0.1:32185/v1/rerank"

# List of external MCP servers
# Example GitHub MCP server
# ===================================================================
# MCP SERVERS CONFIGURATION
# ===================================================================

# MCP (Model Context Protocol) servers provide Manifold with access to external tools and data
mcpServers:
  github:
    command: docker
    args:
      - run
      - -i
      - --rm
      - -e
      - GITHUB_PERSONAL_ACCESS_TOKEN
      - ghcr.io/github/github-mcp-server
    env:
      GITHUB_PERSONAL_ACCESS_TOKEN: "<YOUR_TOKEN>"

# Manifold's internal MCP server
# Manifold's built-in MCP server
  manifold:
    command: ./cmd/mcp-manifold/mcp-manifold
    args: []
    env: {}
    env: {}

# Example GitHub MCP server (commented by default)
# github:
#   command: docker
#   args:
#     - run
#     - -i
#     - --rm
#     - -e
#     - GITHUB_PERSONAL_ACCESS_TOKEN
#     - ghcr.io/github/github-mcp-server
#   env:
#     GITHUB_PERSONAL_ACCESS_TOKEN: ""
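
To make the value-type handling described above concrete, the new process_config.sh script (shown in the next file) can be exercised locally from a checkout, assuming yq v4 is on the PATH and config.yaml.example is in the working directory (a sketch, not part of the PR; the values are illustrative only):

    export MANIFOLD__PORT=9000                          # number, written unquoted
    export MANIFOLD__SINGLE_NODE_INSTANCE=false         # boolean, written unquoted
    export MANIFOLD__EMBEDDINGS__EMBED_PREFIX="doc: "   # string, written quoted
    ./process_config.sh
    yq '.port, .single_node_instance, .embeddings.embed_prefix' config.yaml

The final yq command should print 9000, false, and the prefix string, showing that numbers and booleans are written as bare YAML scalars while arbitrary strings are quoted.
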
89 changes: 89 additions & 0 deletions process_config.sh
@@ -0,0 +1,89 @@
#!/bin/bash
set -e

CONFIG_TEMPLATE="./config.yaml.example"
CONFIG_OUTPUT="./config.yaml"

# Check if yq is installed
if ! command -v yq &> /dev/null; then
    echo "yq could not be found, but is required to run this script."
    exit 1
fi

# Copy the template to start with
cp "$CONFIG_TEMPLATE" "$CONFIG_OUTPUT"

# Extract all paths from the YAML file with their original casing
echo "Building YAML path dictionary..."
declare -A path_map
while IFS= read -r path; do
    if [ ! -z "$path" ]; then
        # Map the lowercased path to its original casing
        path_lower=$(echo "$path" | tr '[:upper:]' '[:lower:]')
        path_map["$path_lower"]="$path"
    fi
done < <(yq eval '.. | path | select(length > 0) | join(".")' "$CONFIG_TEMPLATE")

# Debug: Print all paths found
echo "Found paths in YAML:"
for path in "${!path_map[@]}"; do
    echo " $path -> ${path_map[$path]}"
done

# Process environment variables prefixed with MANIFOLD__
for var in $(env | grep ^MANIFOLD__ | cut -d= -f1); do
    # Remove MANIFOLD__ prefix
    key_without_prefix=$(echo "$var" | sed 's/^MANIFOLD__//')

    # Convert double underscore to dots for nested paths
    env_path=$(echo "$key_without_prefix" | tr '[:upper:]' '[:lower:]' | sed 's/__/./g')

    # Check if this exact path exists in our dictionary
    yaml_path=""
    if [ -n "${path_map[$env_path]}" ]; then
        # Direct match found
        yaml_path="${path_map[$env_path]}"
        echo "Direct match found: $env_path -> $yaml_path"
    else
        # No direct match, try fuzzy matching
        for path_key in "${!path_map[@]}"; do
            # Compare normalized versions (all lowercase, with dots replaced by underscores)
            path_norm=$(echo "$path_key" | sed 's/\./_/g')
            env_norm=$(echo "$env_path" | sed 's/\./_/g')

            if [ "$path_norm" = "$env_norm" ]; then
                yaml_path="${path_map[$path_key]}"
                echo "Fuzzy match found: $env_path -> $yaml_path (normalized: $path_norm)"
                break
            fi
        done
    fi

    # If no match found, use the normalized path
    if [ -z "$yaml_path" ]; then
        yaml_path="$env_path"
        echo "Warning: No match found for $var, using $yaml_path"
    fi

    # Get environment variable value
    value="${!var}"
    echo "Setting $yaml_path = $value"

    # Special handling for arrays/objects
    if [[ "$value" == \[* ]] || [[ "$value" == \{* ]]; then
        # Handle as JSON - assumed to be valid JSON
        yq -i ".$yaml_path = $value" "$CONFIG_OUTPUT"
    else
        # Handle as scalar value
        # Properly quote strings if needed
        if [[ "$value" =~ ^[0-9]+$ ]] || [[ "$value" == "true" ]] || [[ "$value" == "false" ]] || [[ "$value" == "null" ]]; then
            # Numeric or boolean values don't need quotes
            yq -i ".$yaml_path = $value" "$CONFIG_OUTPUT"
        else
            # String values need quotes
            yq -i ".$yaml_path = \"$value\"" "$CONFIG_OUTPUT"
        fi
    fi
done

echo "Config file processed successfully!"