-
-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathconfig_sample.yml
74 lines (58 loc) · 2.3 KB
/
config_sample.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# Options for networking
network:
  # The IP to host on (default: 127.0.0.1).
  # Use 0.0.0.0 to expose on all network adapters.
  host: 127.0.0.1

  # The port to host on (default: 5000).
  # Note: 5001 is recommended on macOS because AirServer runs on port 5000.
  port: 5000

  # Disable HTTP token authentication with requests.
  # WARNING: This will make your instance vulnerable!
  # Turn on this option if you are ONLY connecting from localhost.
  disable_auth: false
# Options for logging
logging:
  # Enable prompt logging (default: false).
  log_prompt: false

  # Enable generation parameter logging (default: false).
  log_generation_params: false
# Options for model overrides and loading
model:
  # Directory to look for models (default: models).
  # Windows users, do NOT put this path in quotes!
  model_dir: models

  # An initial model to load.
  # Make sure the model is located in the model directory!
  # REQUIRED: This must be filled out to load a model on startup.
  model_name:

  # Max sequence length (default: Empty).
  # Fetched from the model's base sequence length in config.json by default.
  max_seq_len:

  # Number of model layers to offload to the GPU (default: 0).
  # Set this to 999 to offload all layers to the GPU.
  num_gpu_layers: 0

  # An integer array of GBs of VRAM to split between GPUs (default: []).
  # Specifying more entries than there are GPUs will cause a crash when
  # loading the model.
  gpu_split: []

  # Prompt template to use for chat completions (default: None).
  prompt_template:

  # Enable flash attention (default: false).
  flash_attention: false

  # RoPE frequency base. 0 = model default (default: 0).
  # Adjust this value for NTK scaling.
  rope_freq_base: 0

  # Enable YaRN scaling. All other parameters are inherited from the model
  # (default: false).
  # Turning this on disables linear/NTK RoPE scaling.
  enable_yarn: false

  # K cache quantization type (default: f16).
  # Possible values - f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0
  cache_mode_k: f16

  # V cache quantization type (default: f16).
  # Possible values - f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0
  cache_mode_v: f16
# Options for sampling
sampling:
  # Select a sampler override preset (default: None).
  # Find this in the sampler_overrides folder.
  # This overrides default fallbacks for sampler values that are passed to
  # the API.
  override_preset: