from dataclasses import dataclass
from os.path import dirname, abspath
# on Windows, replace '\' with '/' so the '/'-joined paths below work on any platform
# e.g. 'C:\\project\\repo' becomes 'C:/project/repo'; POSIX paths pass through unchanged
PROJECT_ROOT: str = abspath(dirname(__file__)).replace('\\', '/')
# ===================================================================================
# Inference configuration
@dataclass
class InferConfig:
    max_seq_len: int = 320                  # maximum length of the generated answer
    mixed_precision: str = "bf16"           # mixed precision: 'no', 'fp16', 'bf16' or 'fp8'
    # full-parameter DPO model; tokenizer files and model weights live in the same folder
    model_dir: str = PROJECT_ROOT + '/model_save/'
    # model file after merging the LoRA DPO adapter
    # model_file: str = PROJECT_ROOT + '/model_save/chat_small_t5.best.dpo.lora_merged.bin'
    # the following fields configure the api demo:
    api_key: str = ""
    host: str = '127.0.0.1'
    port: int = 8812
    reload: bool = True
    workers: int = 1
    log_level: str = 'info'
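# A minimal usage sketch (not part of the original file). The api fields above match
# uvicorn.run() keyword arguments, so an API demo could plausibly be launched with:
#
#   import uvicorn
#   conf = InferConfig()
#   uvicorn.run('api_demo:app', host=conf.host, port=conf.port, reload=conf.reload,
#               workers=conf.workers, log_level=conf.log_level)
#
# 'api_demo:app' is a hypothetical module:app path; substitute the repo's actual app.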
#===================================================================================
# DPO training configuration
@dataclass
class DpoConfig:
    max_seq_len: int = 512 + 8              # 8 for eos token
    sft_model_file: str = PROJECT_ROOT + '/model_save/'
    tokenizer_dir: str = PROJECT_ROOT + '/model_save/'  # the tokenizer usually lives in the same folder as the model weights
    dpo_train_file: str = PROJECT_ROOT + '/data/my_dpo_data.json'
    dpo_eval_file: str = PROJECT_ROOT + '/data/my_dpo_eval.json'
    adapter_file: str = PROJECT_ROOT + '/data/dpo/adapter_model.safetensors'
    log_dir: str = PROJECT_ROOT + '/logs/'
    per_device_train_batch_size: int = 4
    num_train_epochs: int = 4
    gradient_accumulation_steps: int = 8
    learning_rate: float = 1e-5
    logging_first_step: bool = True
    logging_steps: int = 20
    save_steps: int = 2000
    output_dir: str = PROJECT_ROOT + '/model_save/dpo'
    warmup_steps: int = 1000
    fp16: bool = True
    seed: int = 23333
    beta: float = 0.1
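# A hedged sketch (not in the original file): the hyperparameter names above mirror
# transformers.TrainingArguments, so they could be forwarded like this:
#
#   from transformers import TrainingArguments
#   cfg = DpoConfig()
#   training_args = TrainingArguments(
#       output_dir=cfg.output_dir,
#       per_device_train_batch_size=cfg.per_device_train_batch_size,
#       num_train_epochs=cfg.num_train_epochs,
#       gradient_accumulation_steps=cfg.gradient_accumulation_steps,
#       learning_rate=cfg.learning_rate,
#       logging_first_step=cfg.logging_first_step,
#       logging_steps=cfg.logging_steps,
#       save_steps=cfg.save_steps,
#       warmup_steps=cfg.warmup_steps,
#       fp16=cfg.fp16,
#       seed=cfg.seed,
#   )
#
# beta and the file paths are consumed by the DPO trainer itself; the effective batch
# size per device is per_device_train_batch_size * gradient_accumulation_steps = 4 * 8 = 32.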
# ===================================================================================
# SFT configuration
@dataclass
class SFTconfig:
    max_seq_len: int = 384 + 8              # 8 for eos token
    finetune_from_ckp_file: str = PROJECT_ROOT + '/model_save/'
    tokenizer_dir: str = PROJECT_ROOT + '/model_save/'  # the tokenizer usually lives in the same folder as the model weights
    sft_train_file: str = PROJECT_ROOT + '/data/sft_train.json'
    batch_size: int = 12
    num_train_epochs: int = 4
    save_steps: int = 5000
    gradient_accumulation_steps: int = 4
    learning_rate: float = 1e-5
    logging_first_step: bool = True
    logging_steps: int = 100
    output_dir: str = PROJECT_ROOT + '/model_save/sft'
    warmup_steps: int = 100
    fp16: bool = True
    seed: int = 23333
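# (Added note, not in the original file.) As with DpoConfig above, most of these names
# mirror transformers.TrainingArguments; the effective SFT batch size per device is
# batch_size * gradient_accumulation_steps = 12 * 4 = 48.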
# ===================================================================================
# Training (pre-training) configuration
@dataclass
class TrainConfig:
    epochs: int = 8
    batch_size_per_gpu: int = 16
    learn_rate: float = 0.0001              # peak learning rate = div_factor * learn_rate
    div_factor: int = 50
    mixed_precision: str = "bf16"           # mixed precision: 'no', 'fp16', 'bf16' or 'fp8'
    # Note: each optimizer update effectively uses batch_size * gradient_accumulation_steps samples;
    # in plain terms, accumulating gradients over n > 1 steps multiplies the effective batch size by n
    # (see the worked example at the end of this class)
    gradient_accumulation_steps: int = 8    # number of gradient accumulation steps per update
    warmup_steps: int = 1024                # warmup steps; warmup samples = warmup_steps * batch_size * gradient_accumulation_steps
    tokenizer_dir: str = PROJECT_ROOT + '/model_save/'  # the tokenizer usually lives in the same folder as the model weights
    model_file: str = PROJECT_ROOT + '/model_save/chat_small_t5.{}.bin'
    model_config_file: str = PROJECT_ROOT + '/model_save/model_config.json'
    train_file: str = PROJECT_ROOT + '/data/my_train_dataset.parquet'
    validation_file: str = PROJECT_ROOT + '/data/my_valid_dataset.parquet'
    test_file: str = PROJECT_ROOT + '/data/my_test_dataset.parquet'
    # checkpoint to finetune from; only takes effect when the train function is called with is_finetune=True
    # when finetuning, remember to freeze some layers or lower the learning rate
    finetune_from_ckp_file: str = PROJECT_ROOT + '/model_save/chat_small_t5.best.bin'
    # training-state save directory; training can resume from here after an interruption
    train_state_dir: str = PROJECT_ROOT + '/model_save/train_latest_state'
    output_dir: str = PROJECT_ROOT + '/model_save/pretrain'
    logging_steps: int = 50
    save_steps: int = 10000
    # dataset_cache_dir: str = PROJECT_ROOT + '/data/.cache'
    # trainer_log_file: str = PROJECT_ROOT + '/logs/trainer.log'
    keep_latest_n_ckp: int = 8              # maximum number of best-scoring checkpoints to keep during training
    seed: int = 23333
    dataloader_buffer_size: int = 50000
    max_seq_len: int = 256                  # maximum sentence length (default: 256)
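    # Worked example for the notes above (a sketch, assuming a single GPU):
    #   effective batch size = batch_size_per_gpu * gradient_accumulation_steps = 16 * 8 = 128
    #   warmup samples = warmup_steps * batch_size_per_gpu * gradient_accumulation_steps
    #                  = 1024 * 16 * 8 = 131,072
    #   peak learning rate = div_factor * learn_rate = 50 * 0.0001 = 0.005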
#======================================================================================
# Model configuration
@dataclass
class T5ModelConfig:
    d_ff: int = 3072                        # feed-forward layer dimension
    d_model: int = 768                      # embedding / model dimension
    num_heads: int = 12                     # number of attention heads; d_model // num_heads == d_kv
    d_kv: int = 64                          # d_model // num_heads
    num_decoder_layers: int = 10            # number of Transformer decoder layers
    num_layers: int = 10                    # number of Transformer encoder layers
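

if __name__ == '__main__':
    # Quick sanity check (an added sketch, not in the original file): T5 attention
    # shapes only line up when d_kv * num_heads == d_model.
    _cfg = T5ModelConfig()
    assert _cfg.d_kv * _cfg.num_heads == _cfg.d_model, 'd_kv must equal d_model // num_heads'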