Support overriding the config.yaml with environment variables at runtime (e.g. in Docker) #57

Merged
27 changes: 22 additions & 5 deletions Dockerfile
@@ -37,17 +37,34 @@ RUN go build -ldflags="-s -w" -trimpath -o ./dist/manifold .
FROM debian:bullseye-slim

ENV JAEGER_ENDPOINT=http://0.0.0.0:16686

# Install necessary certificates
RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates && rm -rf /var/lib/apt/lists/*
ENV DEBIAN_FRONTEND=noninteractive

# Install necessary packages
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        bash \
        ca-certificates \
        curl \
        wget && \
    # Install yq for YAML processing
    # yq is used to process the config file
    wget -qO /usr/local/bin/yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 && \
    chmod +x /usr/local/bin/yq && \
    # Cleanup
    apt-get autoremove -y && \
    apt-get clean

WORKDIR /app

# Copy the built binary from stage 2
COPY --from=backend-builder /manifold/dist/manifold /app/

COPY config.yaml /app/
# Copy the tokenized config file and processor script
COPY config.yaml.example /app/
COPY process_config.sh /app/
RUN chmod +x /app/process_config.sh

EXPOSE 8080

CMD ["/app/manifold"]
# Process config and start the application
CMD ["/bin/bash", "-c", "/app/process_config.sh && /app/manifold"]
175 changes: 118 additions & 57 deletions config.yaml.example
@@ -1,86 +1,147 @@
# Manifold Example Configuration
# ===================================================================
# Manifold Configuration with Environment Variable Support
# ===================================================================
#
# This configuration file contains default values for Manifold.
# All settings can be overridden at runtime using environment variables.
#
# ENVIRONMENT VARIABLE MAPPING:
# ----------------------------
# Environment variables are automatically mapped to YAML paths using this convention:
#
# 1. Prefix all variables with "MANIFOLD__"
# 2. Use UPPERCASE for all letters
# 3. Use DOUBLE underscore (__) to separate YAML hierarchy levels
# 4. Use SINGLE underscore (_) for keys containing underscores
#
# Examples:
# MANIFOLD__HOST=api.example.com → host: 'api.example.com'
# MANIFOLD__PORT=9000 → port: 9000
# MANIFOLD__DATABASE__CONNECTION_STRING=... → database.connection_string: '...'
# MANIFOLD__MCPSERVERS__GITHUB__COMMAND=docker → mcpServers.github.command: 'docker'
# MANIFOLD__COMPLETIONS__DEFAULT_HOST=http://... → completions.default_host: 'http://...'
#
# VALUE TYPES:
# -----------
# Different value types are handled automatically, as shown below:
#
# - Numbers: MANIFOLD__PORT=8080 → port: 8080
# - Booleans: MANIFOLD__SINGLE_NODE_INSTANCE=false → single_node_instance: false
# - Strings: MANIFOLD__HOST=localhost → host: 'localhost'
# - Null: MANIFOLD__HF_TOKEN=null → hf_token: null
# - JSON arrays: MANIFOLD__MCPSERVERS__GITHUB__ARGS='["run","--rm"]'
#                → mcpServers.github.args: ["run","--rm"]
# - JSON objects: MANIFOLD__SOME__CONFIG='{"key":"value"}'
# → some.config: {"key":"value"}
#
# HOW IT WORKS:
# ------------
# At container startup, the process_config.sh script:
# 1. Copies this file to config.yaml
# 2. Finds all MANIFOLD__* environment variables
# 3. Maps them to their corresponding YAML paths
# 4. Updates the config.yaml file accordingly
#
# ===================================================================

# Manifold Host
# ===================================================================
# SERVER CONFIGURATION
# ===================================================================

# Server address and port
host: 'localhost'
port: 8080

# Manifold storage path: models, database files, etc
data_path: '/Users/yourusername/.manifold' # REPLACE with your actual path
# Storage path for models, database files, and other persistent data
data_path: '/data'

# ===================================================================
# RUNTIME CONFIGURATION
# ===================================================================

# Set to true to automatically run llama-server instances for embeddings, reranker, and completions
# This enables the http://localhost:32186/v1/chat/completions endpoint running the gemma-3-4b-it model
# When enabled, Manifold automatically runs llama-server instances for:
# - embeddings (port 32184)
# - reranker (port 32185)
# - completions (port 32186)
single_node_instance: true

# Database Configuration (PGVector)
# ===================================================================
# DATABASE CONFIGURATION
# ===================================================================

database:
  connection_string: "postgres://pgadmin:yourpassword@localhost:5432/manifold?sslmode=disable" # REPLACE with your actual credentials
  # PostgreSQL connection string with PGVector extension
  # Format: postgres://username:password@hostname:port/database?sslmode=disable
  connection_string: ""

# ===================================================================
# API TOKENS
# ===================================================================

# HuggingFace Token
hf_token: "..."
# HuggingFace API token for accessing gated models
hf_token: ""

# Google Gemini API token
google_gemini_key: "..."
google_gemini_key: ""

# Anthropic API token
anthropic_key: "..."
# Anthropic API token (Claude models)
anthropic_key: ""

# The completions, embeddings, and reranker services are automatically bootstrapped by Manifold on the ports defined below.
#
# - Completions Service: Handles the generation of text completions based on input prompts.
# - Embeddings Service: Manages the creation of vector representations for text data.
# - Reranker Service: Reorders search results to improve relevance based on certain criteria.
#
# Each service is configured with default settings, but can be customized as needed.
# Users can also set remote hosts for these services if they are running on different machines.
# ===================================================================
# LLM SERVICES CONFIGURATION
# ===================================================================

# Example CLI command for running the completions service manually:
# llama-server -m <data_path>/models/gguf/gemma-3-4b-it.Q8_0.gguf --temp 1.0 --ctx-size 16384 --min-p 0.01 --top-p 0.95 --top-k 64 --repeat-penalty 1.0 -t -1 -ngl 99 --parallel 4 --batch-size 2048 --ubatch-size 512 --threads-http 4 -fa --host 127.0.0.1 --port 32186 --props
#
# Default Completions Configuration - using local gemma-3-4b-it model
# Completions Service Configuration
# Handles generation of text completions based on input prompts
completions:
  default_host: "http://127.0.0.1:32186/v1/chat/completions" # or https://api.openai.com/v1/chat/completions
  completions_model: 'gpt-4o' # ignored if using local endpoint
  api_key: "" # Used with OpenAI API if configured as default host
  # OpenAI-compatible API endpoint
  default_host: "http://127.0.0.1:32186/v1/chat/completions"
  # Model identifier to use for completions
  completions_model: 'gpt-4o'
  # API key for the completions service (if required)
  api_key: ""

# Example CLI command for running the embeddings service manually:
# llama-server -m <data_path>/models/embeddings/nomic-embed-text-v1.5.Q8_0.gguf -c 65536 -np 8 -b 8192 -ub 8192 -fa --host 127.0.0.1 --port 32184 -lv 1 --embedding
#
# Embeddings API Configuration
# Using local nomic-embed-text-v1.5
# The initialize process will automatically download and start the model at port 32184
# Embeddings Service Configuration
# Manages the creation of vector representations for text data
embeddings:
  # OpenAI-compatible API endpoint
  host: "http://127.0.0.1:32184/v1/embeddings"
  # OpenAI API compatible API key, not required for local servers unless configured on that server
  # API key for the embeddings service (if required)
  api_key: ""
  dimensions: 768 # Size of embedding dimensions
  # Vector dimensions for the embedding model
  dimensions: 768
  # Prefix added to document text before embedding
  embed_prefix: "search_document: "
  # Prefix added to query text before embedding
  search_prefix: "search_query: "

# Example CLI command for running the reranker service manually:
# llama-server -m <data_path>/models/rerankers/slide-bge-reranker-v2-m3.Q4_K_M.gguf -c 65536 -np 8 -b 8192 -ub 8192 -fa --host 127.0.0.1 --port 32185 -lv 1 --reranking --pooling rank
#
# Reranker using local slide-bge-reranker-v2-m3
# The initialize process will automatically download and start the model at port 32185
# Reranker Service Configuration
# Reorders search results to improve relevance
reranker:
  # OpenAI-compatible API endpoint
  host: "http://127.0.0.1:32185/v1/rerank"

# List of external MCP servers
# Example GitHub MCP server
# ===================================================================
# MCP SERVERS CONFIGURATION
# ===================================================================

# MCP (Model Context Protocol) servers provide Manifold with access to external tools and data
mcpServers:
  github:
    command: docker
    args:
      - run
      - -i
      - --rm
      - -e
      - GITHUB_PERSONAL_ACCESS_TOKEN
      - ghcr.io/github/github-mcp-server
    env:
      GITHUB_PERSONAL_ACCESS_TOKEN: "<YOUR_TOKEN>"

# Manifold's internal MCP server
# Manifold's built-in MCP server
  manifold:
    command: ./cmd/mcp-manifold/mcp-manifold
    args: []
    env: {}
    env: {}

# Example GitHub MCP server (commented by default)
# github:
#   command: docker
#   args:
#     - run
#     - -i
#     - --rm
#     - -e
#     - GITHUB_PERSONAL_ACCESS_TOKEN
#     - ghcr.io/github/github-mcp-server
#   env:
#     GITHUB_PERSONAL_ACCESS_TOKEN: ""
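
To make the value-type handling described above concrete, the new process_config.sh script (shown in the next file) can be exercised locally from a checkout, assuming yq v4 is on the PATH and config.yaml.example is in the working directory (a sketch, not part of the PR; the values are illustrative only):

    export MANIFOLD__PORT=9000                          # number, written unquoted
    export MANIFOLD__SINGLE_NODE_INSTANCE=false         # boolean, written unquoted
    export MANIFOLD__EMBEDDINGS__EMBED_PREFIX="doc: "   # string, written quoted
    ./process_config.sh
    yq '.port, .single_node_instance, .embeddings.embed_prefix' config.yaml

The final yq command should print 9000, false, and the prefix string, showing that numbers and booleans are written as bare YAML scalars while arbitrary strings are quoted.
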
89 changes: 89 additions & 0 deletions process_config.sh
@@ -0,0 +1,89 @@
#!/bin/bash
set -e

CONFIG_TEMPLATE="./config.yaml.example"
CONFIG_OUTPUT="./config.yaml"

# Check if yq is installed
if ! command -v yq &> /dev/null; then
    echo "yq could not be found, but is required to run this script."
    exit 1
fi

# Copy the template to start with
cp "$CONFIG_TEMPLATE" "$CONFIG_OUTPUT"

# Extract all paths from the YAML file with their original casing
echo "Building YAML path dictionary..."
declare -A path_map
while IFS= read -r path; do
    if [ ! -z "$path" ]; then
        # Map the lowercased path to its original casing
        path_lower=$(echo "$path" | tr '[:upper:]' '[:lower:]')
        path_map["$path_lower"]="$path"
    fi
done < <(yq eval '.. | path | select(length > 0) | join(".")' "$CONFIG_TEMPLATE")

# Debug: Print all paths found
echo "Found paths in YAML:"
for path in "${!path_map[@]}"; do
    echo " $path -> ${path_map[$path]}"
done

# Process environment variables prefixed with MANIFOLD__
for var in $(env | grep ^MANIFOLD__ | cut -d= -f1); do
    # Remove MANIFOLD__ prefix
    key_without_prefix=$(echo "$var" | sed 's/^MANIFOLD__//')

    # Convert double underscore to dots for nested paths
    env_path=$(echo "$key_without_prefix" | tr '[:upper:]' '[:lower:]' | sed 's/__/./g')

    # Check if this exact path exists in our dictionary
    yaml_path=""
    if [ -n "${path_map[$env_path]}" ]; then
        # Direct match found
        yaml_path="${path_map[$env_path]}"
        echo "Direct match found: $env_path -> $yaml_path"
    else
        # No direct match, try fuzzy matching
        for path_key in "${!path_map[@]}"; do
            # Compare normalized versions (all lowercase, with dots replaced by underscores)
            path_norm=$(echo "$path_key" | sed 's/\./_/g')
            env_norm=$(echo "$env_path" | sed 's/\./_/g')

            if [ "$path_norm" = "$env_norm" ]; then
                yaml_path="${path_map[$path_key]}"
                echo "Fuzzy match found: $env_path -> $yaml_path (normalized: $path_norm)"
                break
            fi
        done
    fi

    # If no match found, use the normalized path
    if [ -z "$yaml_path" ]; then
        yaml_path="$env_path"
        echo "Warning: No match found for $var, using $yaml_path"
    fi

    # Get environment variable value
    value="${!var}"
    echo "Setting $yaml_path = $value"

    # Special handling for arrays/objects
    if [[ "$value" == \[* ]] || [[ "$value" == \{* ]]; then
        # Handle as JSON - assumed to be valid JSON
        yq -i ".$yaml_path = $value" "$CONFIG_OUTPUT"
    else
        # Handle as scalar value
        # Properly quote strings if needed
        if [[ "$value" =~ ^[0-9]+$ ]] || [[ "$value" == "true" ]] || [[ "$value" == "false" ]] || [[ "$value" == "null" ]]; then
            # Numeric or boolean values don't need quotes
            yq -i ".$yaml_path = $value" "$CONFIG_OUTPUT"
        else
            # String values need quotes
            yq -i ".$yaml_path = \"$value\"" "$CONFIG_OUTPUT"
        fi
    fi
done

echo "Config file processed successfully!"