Fix kl_coeff in Qwen RL configs (#10530)

fjjF77 · web-flow · commit 95901cb11b17 · 2025-04-30T17:08:44.000+08:00
diff --git a/llm/config/qwen/grpo_32b_argument.yaml b/llm/config/qwen/grpo_32b_argument.yaml
@@ -73,7 +73,7 @@ ignore_save_lr_and_optim: true # Whether to ignore saving learning rate and opti
 disable_tqdm: true # Whether to disable tqdm progress bar
 
 # RL args
-kl_coeff: 0.0 # KL coefficient
+kl_coeff: 0.001 # KL coefficient for PPO and Reinforce++
 kl_loss_coeff: 0.001 # KL loss coefficient
 pg_loss_coeff: 1.0 # Policy gradient loss coefficient
 entropy_coeff: 0.0 # Entropy coefficient
diff --git a/llm/config/qwen/grpo_argument.yaml b/llm/config/qwen/grpo_argument.yaml
@@ -73,7 +73,7 @@ ignore_save_lr_and_optim: true # Whether to ignore saving learning rate and opti
 disable_tqdm: true # Whether to disable tqdm progress bar
 
 # RL args
-kl_coeff: 0.0 # KL coefficient
+kl_coeff: 0.001 # KL coefficient for PPO and Reinforce++
 kl_loss_coeff: 0.001 # KL loss coefficient
 pg_loss_coeff: 1.0 # Policy gradient loss coefficient
 entropy_coeff: 0.0 # Entropy coefficient
diff --git a/llm/config/qwen/reinforce_plus_plus_argument.yaml b/llm/config/qwen/reinforce_plus_plus_argument.yaml
@@ -73,7 +73,7 @@ ignore_save_lr_and_optim: true # Whether to ignore saving learning rate and opti
 disable_tqdm: true # Whether to disable tqdm progress bar
 
 # RL args
-kl_coeff: 0.0 # KL coefficient
+kl_coeff: 0.001 # KL coefficient for PPO and Reinforce++
 kl_loss_coeff: 0.000 # KL loss coefficient
 pg_loss_coeff: 1.0 # Policy gradient loss coefficient
 entropy_coeff: 0.0 # Entropy coefficient