# config_sample.yml

# Network settings
network:
  # IP address to host on (default: 127.0.0.1).
  # Set this to 0.0.0.0 to expose the server on all network adapters.
  host: 127.0.0.1

  # Port to host on (default: 5000).
  # Note: 5001 is recommended on macOS because AirServer runs on port 5000.
  port: 5000

  # Turns off HTTP token authentication for requests.
  # WARNING: Turning authentication off makes your instance vulnerable!
  # Enable this option ONLY if you connect exclusively from localhost.
  disable_auth: false

# Logging settings
logging:
  # Turn on logging of prompts (default: false).
  log_prompt: false

  # Turn on logging of generation parameters (default: false).
  log_generation_params: false

# Options for model overrides and loading
model:
  # Directory to look for models (default: models).
  # Windows users, do NOT put this path in quotes!
  model_dir: models

  # An initial model to load.
  # Make sure the model is located in the model directory!
  # REQUIRED: This must be filled out (replace null) to load a model on startup.
  model_name: null

  # Max sequence length (default: null).
  # When left as null, this is fetched from the model's base sequence length
  # in config.json.
  max_seq_len: null

  # Number of model layers to offload to the GPU (default: 0).
  # Set this to 999 to offload all layers to the GPU.
  num_gpu_layers: 0

  # An integer array of GBs of VRAM to split between GPUs (default: []).
  # Going over the maximum number of GPUs will cause a crash when loading
  # the model.
  gpu_split: []

  # Prompt template to use for chat completions (default: null).
  prompt_template: null

  # Enable flash attention (default: false).
  flash_attention: false

  # RoPE frequency base. 0 = model default (default: 0).
  # Adjust this value for NTK scaling.
  rope_freq_base: 0

  # Enable YaRN scaling (default: false).
  # All other parameters are inherited from the model.
  # Turning this on disables linear/NTK RoPE scaling.
  enable_yarn: false

  # K cache quantization type (default: f16).
  # Possible values - f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0
  cache_mode_k: f16

  # V cache quantization type (default: f16).
  # Possible values - f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0
  cache_mode_v: f16

# Sampling settings
sampling:
  # Name of a sampler override preset to apply (default: null).
  # Presets are found in the sampler_overrides folder.
  # A preset overrides the default fallbacks for sampler values that are
  # passed to the API.
  override_preset: null