diff --git a/Dockerfile b/Dockerfile
index 033ac5e..9d3707a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -37,17 +37,34 @@ RUN go build -ldflags="-s -w" -trimpath -o ./dist/manifold .
 
 FROM debian:bullseye-slim
 
 ENV JAEGER_ENDPOINT=http://0.0.0.0:16686
-
-# Install necessary certificates
-RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates && rm -rf /var/lib/apt/lists/*
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install necessary packages
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    bash \
+    ca-certificates \
+    curl \
+    wget && \
+    # Install yq for YAML processing
+    # yq is used to process the config file
+    wget -qO /usr/local/bin/yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 && \
+    chmod +x /usr/local/bin/yq && \
+    # Cleanup
+    apt-get autoremove -y && \
+    apt-get clean
 
 WORKDIR /app
 
 # Copy the built binary from stage 2
 COPY --from=backend-builder /manifold/dist/manifold /app/
-COPY config.yaml /app/
+# Copy the config template and processor script
+COPY config.yaml.example /app/
+COPY process_config.sh /app/
+RUN chmod +x /app/process_config.sh
 
 EXPOSE 8080
 
-CMD ["/app/manifold"]
+# Process config and start the application
+CMD ["/bin/bash", "-c", "/app/process_config.sh && /app/manifold"]
diff --git a/config.yaml.example b/config.yaml.example
index 740eb4a..efb8bea 100644
--- a/config.yaml.example
+++ b/config.yaml.example
@@ -1,86 +1,147 @@
-# Manifold Example Configuration
+# ===================================================================
+# Manifold Configuration with Environment Variable Support
+# ===================================================================
+#
+# This configuration file contains default values for Manifold.
+# All settings can be overridden at runtime using environment variables.
+#
+# ENVIRONMENT VARIABLE MAPPING:
+# ----------------------------
+# Environment variables are automatically mapped to YAML paths using this convention:
+#
+# 1. Prefix all variables with "MANIFOLD__"
+# 2. Use UPPERCASE for all letters
+# 3. Use DOUBLE underscore (__) to separate YAML hierarchy levels
+# 4. Use SINGLE underscore (_) for keys containing underscores
+#
+# Examples:
+#   MANIFOLD__HOST=api.example.com → host: 'api.example.com'
+#   MANIFOLD__PORT=9000 → port: 9000
+#   MANIFOLD__DATABASE__CONNECTION_STRING=... → database.connection_string: '...'
+#   MANIFOLD__MCPSERVERS__GITHUB__COMMAND=docker → mcpServers.github.command: 'docker'
+#   MANIFOLD__COMPLETIONS__DEFAULT_HOST=http://... → completions.default_host: 'http://...'
+#
+# VALUE TYPES:
+# -----------
+# Different value types are handled automatically, as shown below:
+#
+# - Numbers:  MANIFOLD__PORT=8080 → port: 8080
+# - Booleans: MANIFOLD__SINGLE_NODE_INSTANCE=false → single_node_instance: false
+# - Strings:  MANIFOLD__HOST=localhost → host: 'localhost'
+# - Null:     MANIFOLD__HF_TOKEN=null → hf_token: null
+# - JSON arrays:  MANIFOLD__MCPSERVERS__GITHUB__ARGS='["run","--rm"]'
+#                 → mcpservers.github.args: ["run","--rm"]
+# - JSON objects: MANIFOLD__SOME__CONFIG='{"key":"value"}'
+#                 → some.config: {"key":"value"}
+#
+# HOW IT WORKS:
+# ------------
+# At container startup, the process_config.sh script:
+# 1. Copies this file to config.yaml
+# 2. Finds all MANIFOLD__* environment variables
+# 3. Maps them to their corresponding YAML paths
+# 4. Updates the config.yaml file accordingly
+#
+# ===================================================================
 
-# Manifold Host
+# ===================================================================
+# SERVER CONFIGURATION
+# ===================================================================
+
+# Server address and port
 host: 'localhost'
 port: 8080
 
-# Manifold storage path: models, database files, etc
-data_path: '/Users/yourusername/.manifold' # REPLACE with your actual path
+# Storage path for models, database files, and other persistent data
+data_path: '/data'
+
+# ===================================================================
+# RUNTIME CONFIGURATION
+# ===================================================================
 
-# Set to true to automatically run llama-server instances for embeddings, reranker, and completions
-# This enables the http://localhost:32186/v1/chat/completions endpoint running the gemma-3-4b-it model
+# When enabled, Manifold automatically runs llama-server instances for:
+# - embeddings (port 32184)
+# - reranker (port 32185)
+# - completions (port 32186)
 single_node_instance: true
 
-# Database Configuration (PGVector)
+# ===================================================================
+# DATABASE CONFIGURATION
+# ===================================================================
+
 database:
-  connection_string: "postgres://pgadmin:yourpassword@localhost:5432/manifold?sslmode=disable" # REPLACE with your actual credentials
+  # PostgreSQL connection string with PGVector extension
+  # Format: postgres://username:password@hostname:port/database?sslmode=disable
+  connection_string: ""
+
+# ===================================================================
+# API TOKENS
+# ===================================================================
 
-# HuggingFace Token
-hf_token: "..."
+# HuggingFace API token for accessing gated models
+hf_token: ""
 
 # Google Gemini API token
-google_gemini_key: "..."
+google_gemini_key: ""
 
-# Anthropic API token
-anthropic_key: "..."
+# Anthropic API token (Claude models)
+anthropic_key: ""
 
-# The completions, embeddings, and reranker services are automatically bootstrapped by Manifold in the ports defined below.
-#
-# - Completions Service: Handles the generation of text completions based on input prompts.
-# - Embeddings Service: Manages the creation of vector representations for text data.
-# - Reranker Service: Reorders search results to improve relevance based on certain criteria.
-#
-# Each service is configured with default settings, but can be customized as needed.
-# Users can also set remote hosts for these services if they are running on different machines.
+# ===================================================================
+# LLM SERVICES CONFIGURATION
+# ===================================================================
 
-# Example CLI command for running the completions service manually:
-# llama-server -m /models/gguf/gemma-3-4b-it.Q8_0.gguf --temp 1.0 --ctx-size 16384 --min-p 0.01 --top-p 0.95 --top-k 64 --repeat-penalty 1.0 -t -1 -ngl 99 --parallel 4 --batch-size 2048 --ubatch-size 512 --threads-http 4 -fa --host 127.0.0.1 --port 32186 --props
-#
-# Default Completions Configuration - using local gemma-3-4b-it model
+# Completions Service Configuration
+# Handles generation of text completions based on input prompts
 completions:
-  default_host: "http://127.0.0.1:32186/v1/chat/completions" # or https://api.openai.com/v1/chat/completions
-  completions_model: 'gpt-4o' # ignored if using local endpoint
-  api_key: "" # Used with OpenAI API if configured as default host
+  # OpenAI-compatible API endpoint
+  default_host: "http://127.0.0.1:32186/v1/chat/completions"
+  # Model identifier to use for completions
+  completions_model: 'gpt-4o'
+  # API key for the completions service (if required)
+  api_key: ""
 
-# Example CLI command for running the embeddings service manually:
-# llama-server -m /models/embeddings/nomic-embed-text-v1.5.Q8_0.gguf -c 65536 -np 8 -b 8192 -ub 8192 -fa --host 127.0.0.1 --port 32184 -lv 1 --embedding
-#
-# Embeddings API Configuration
-# Using local nomic-embed-text-v1.5
-# The initialize process will automatically download and start the model at port 32184
+# Embeddings Service Configuration
+# Manages the creation of vector representations for text data
 embeddings:
+  # OpenAI-compatible API endpoint
  host: "http://127.0.0.1:32184/v1/embeddings"
-  # OpenAI API compatible API key, not required for local servers unless configured on that server
+  # API key for the embeddings service (if required)
   api_key: ""
-  dimensions: 768 # Size of embedding dimensions
+  # Vector dimensions for the embedding model
+  dimensions: 768
+  # Prefix added to document text before embedding
   embed_prefix: "search_document: "
+  # Prefix added to query text before embedding
   search_prefix: "search_query: "
 
-# Example CLI command for running the reranker service manually:
-# llama-server -m /models/rerankers/slide-bge-reranker-v2-m3.Q4_K_M.gguf -c 65536 -np 8 -b 8192 -ub 8192 -fa --host 127.0.0.1 --port 32185 -lv 1 --reranking --pooling rank
-#
-# Reranker using local slide-bge-reranker-v2-m3
-# The initialize process will automatically download and start the model at port 32185
+# Reranker Service Configuration
+# Reorders search results to improve relevance
 reranker:
+  # OpenAI-compatible API endpoint
   host: "http://127.0.0.1:32185/v1/rerank"
 
-# List of external MCP servers
-# Example GitHub MCP server
+# ===================================================================
+# MCP SERVERS CONFIGURATION
+# ===================================================================
+
+# MCP (Model Context Protocol) servers provide Manifold with data access
 mcpServers:
-  github:
-    command: docker
-    args:
-      - run
-      - -i
-      - --rm
-      - -e
-      - GITHUB_PERSONAL_ACCESS_TOKEN
-      - ghcr.io/github/github-mcp-server
-    env:
-      GITHUB_PERSONAL_ACCESS_TOKEN: ""
-
-  # Manifold's internal MCP server
+  # Manifold's built-in MCP server
   manifold:
     command: ./cmd/mcp-manifold/mcp-manifold
     args: []
-    env: {}
\ No newline at end of file
+    env: {}
+
+  # Example GitHub MCP server (commented by default)
+  # github:
+  #   command: docker
+  #   args:
+  #     - run
+  #     - -i
+  #     - --rm
+  #     - -e
+  #     - GITHUB_PERSONAL_ACCESS_TOKEN
+  #     - ghcr.io/github/github-mcp-server
+  #   env:
+  #     GITHUB_PERSONAL_ACCESS_TOKEN: ""
\ No newline at end of file
diff --git a/process_config.sh b/process_config.sh
new file mode 100644
index 0000000..a850794
--- /dev/null
+++ b/process_config.sh
@@ -0,0 +1,89 @@
+#!/bin/bash
+set -e
+
+CONFIG_TEMPLATE="./config.yaml.example"
+CONFIG_OUTPUT="./config.yaml"
+
+# Check if yq is installed
+if ! command -v yq &> /dev/null; then
+    echo "yq could not be found, but is required to run this script."
+    exit 1
+fi
+
+# Copy the template to start with
+cp "$CONFIG_TEMPLATE" "$CONFIG_OUTPUT"
+
+# Extract all paths from the YAML file with their original casing
+echo "Building YAML path dictionary..."
+declare -A path_map
+while IFS= read -r path; do
+    if [ ! -z "$path" ]; then
+        # Store both formats - with dots and with underscores
+        path_lower=$(echo "$path" | tr '[:upper:]' '[:lower:]')
+        path_map["$path_lower"]="$path"
+    fi
+done < <(yq eval '.. | path | select(length > 0) | join(".")' "$CONFIG_TEMPLATE")
+
+# Debug: Print all paths found
+echo "Found paths in YAML:"
+for path in "${!path_map[@]}"; do
+    echo "  $path -> ${path_map[$path]}"
+done
+
+# Process environment variables prefixed with MANIFOLD__
+for var in $(env | grep ^MANIFOLD__ | cut -d= -f1); do
+    # Remove MANIFOLD__ prefix
+    key_without_prefix=$(echo "$var" | sed 's/^MANIFOLD__//')
+
+    # Convert double underscore to dots for nested paths
+    env_path=$(echo "$key_without_prefix" | tr '[:upper:]' '[:lower:]' | sed 's/__/./g')
+
+    # Check if this exact path exists in our dictionary
+    yaml_path=""
+    if [ -n "${path_map[$env_path]}" ]; then
+        # Direct match found
+        yaml_path="${path_map[$env_path]}"
+        echo "Direct match found: $env_path -> $yaml_path"
+    else
+        # No direct match, try fuzzy matching
+        for path_key in "${!path_map[@]}"; do
+            # Compare normalized versions (all lowercase, no underscores vs dots)
+            path_norm=$(echo "$path_key" | sed 's/\./_/g')
+            env_norm=$(echo "$env_path" | sed 's/\./_/g')
+
+            if [ "$path_norm" = "$env_norm" ]; then
+                yaml_path="${path_map[$path_key]}"
+                echo "Fuzzy match found: $env_path -> $yaml_path (normalized: $path_norm)"
+                break
+            fi
+        done
+    fi
+
+    # If no match found, use the normalized path
+    if [ -z "$yaml_path" ]; then
+        yaml_path="$env_path"
+        echo "Warning: No match found for $var, using $yaml_path"
+    fi
+
+    # Get environment variable value
+    value="${!var}"
+    echo "Setting $yaml_path = $value"
+
+    # Special handling for arrays/objects
+    if [[ "$value" == \[* ]] || [[ "$value" == \{* ]]; then
+        # Handle as JSON - assumed to be valid JSON
+        yq -i ".$yaml_path = $value" "$CONFIG_OUTPUT"
+    else
+        # Handle as scalar value
+        # Properly quote strings if needed
+        if [[ "$value" =~ ^[0-9]+$ ]] || [[ "$value" == "true" ]] || [[ "$value" == "false" ]] || [[ "$value" == "null" ]]; then
+            # Numeric or boolean values don't need quotes
+            yq -i ".$yaml_path = $value" "$CONFIG_OUTPUT"
+        else
+            # String values need quotes
+            yq -i ".$yaml_path = \"$value\"" "$CONFIG_OUTPUT"
+        fi
+    fi
+done
+
+echo "Config file processed successfully!"
\ No newline at end of file
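The env-to-YAML mapping performed by process_config.sh can be exercised outside the container before merging. The sketch below is illustrative only: it assumes bash 4+ (for associative arrays) and mikefarah yq v4 are installed locally, and the override values are throwaway examples rather than settings defined in this patch.

```bash
# Run the processor against the template in the repo root, then inspect the output.
export MANIFOLD__PORT=9000                                      # numeric → written unquoted
export MANIFOLD__EMBEDDINGS__EMBED_PREFIX='passage: '           # string → written quoted
export MANIFOLD__MCPSERVERS__MANIFOLD__COMMAND='/usr/local/bin/mcp-manifold'  # nested, mixed-case key (illustrative path)

./process_config.sh

yq '.port' config.yaml                          # expected: 9000
yq '.embeddings.embed_prefix' config.yaml       # expected: passage:
yq '.mcpServers.manifold.command' config.yaml   # expected: /usr/local/bin/mcp-manifold
```

The third variable shows why the script builds a lowercased path dictionary first: MANIFOLD__MCPSERVERS__MANIFOLD__COMMAND normalizes to mcpservers.manifold.command, which the dictionary maps back to the mixed-case mcpServers.manifold.command path before yq is invoked.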
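At the container level, the same convention applies through docker run -e flags. A minimal sketch, assuming the image is built locally as manifold:latest and that a Postgres instance is reachable at the hostname postgres (both assumptions, not part of this diff):

```bash
# Build the image and start it with a few overrides; process_config.sh rewrites
# /app/config.yaml from config.yaml.example before the manifold binary starts.
docker build -t manifold:latest .

docker run --rm -p 8080:8080 \
  -e MANIFOLD__HOST=0.0.0.0 \
  -e MANIFOLD__DATA_PATH=/data \
  -e MANIFOLD__SINGLE_NODE_INSTANCE=true \
  -e MANIFOLD__DATABASE__CONNECTION_STRING='postgres://pgadmin:secret@postgres:5432/manifold?sslmode=disable' \
  manifold:latest
```

Anything not overridden keeps the defaults baked into config.yaml.example.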