
Commit 7dbba98

[RL] Fix typo and add wandb log (#10641)
* add log
* add INFERENCE_TRUNCATED_RETURN_EOS
* add INFERENCE_TRUNCATED_RETURN_EOS
1 parent eee766b commit 7dbba98

File tree

6 files changed: +8 -7 lines changed
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+../../../../../../llm/devices/intel_hpu/tests/README.md

docs/zh/llm/docs/pretrain.md

Lines changed: 0 additions & 1 deletion
This file was deleted.

docs/zh/llm/docs/pretrain.md

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+../../../../llm/docs/pretrain.md

llm/alignment/rl/README.md

Lines changed: 1 addition & 0 deletions
@@ -203,6 +203,7 @@ export FLAGS_cascade_attention_max_partition_size=2048

 python -u -m paddle.distributed.launch --devices "0,1,2,3" run_rl.py ../../config/qwen/reinforce_plus_plus_argument.yaml
 ```
+We provide a [wandb log](https://api.wandb.ai/links/ainlp66-netflix/injcw3ra) that is reproducible with the above script.

 ### Online monitoring
 The output directory configured in `grpo_argument.yaml` and `reinforce_plus_plus_argument.yaml` is `"logging_dir": "vdl_log"`; the training process can be viewed with the following command
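
The commit's headline change here is linking a reproducible wandb run for the REINFORCE++ script. For readers unfamiliar with Weights & Biases, the snippet below is a minimal, hypothetical sketch of how RL training metrics could be pushed to a dashboard like the one linked above; the project name, run name, and metric keys are illustrative and are not taken from PaddleNLP.

```python
# Minimal, hypothetical sketch of logging RL training metrics to Weights & Biases.
# Project/run names and metric keys are illustrative, not PaddleNLP's.
import wandb

wandb.init(project="reinforce-plus-plus-demo", name="qwen-rl-run")

for step in range(3):  # stand-in for the real training loop
    metrics = {
        "train/reward": 0.1 * step,        # placeholder values
        "train/kl_divergence": 0.01,
        "train/pg_loss": 1.0 - 0.05 * step,
    }
    wandb.log(metrics, step=step)

wandb.finish()
```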

llm/config/qwen/grpo_32b_argument.yaml

Lines changed: 1 addition & 1 deletion
@@ -74,7 +74,7 @@ disable_tqdm: true # Whether to disable tqdm progress bar

 # RL args
 kl_coeff: 0.001 # KL coefficient for PPO and Reinforce++
-kl_loss_coeff: 0.001 # KL loss coefficient
+kl_loss_coeff: 0.000 # KL loss coefficient
 pg_loss_coeff: 1.0 # Policy gradient loss coefficient
 entropy_coeff: 0.0 # Entropy coefficient
 clip_range_ratio: 0.2 # The clipping range for ratio between the old and new policy. (PPO algorithm)

llm/config/qwen/grpo_argument.yaml

Lines changed: 1 addition & 1 deletion
@@ -74,7 +74,7 @@ disable_tqdm: true # Whether to disable tqdm progress bar

 # RL args
 kl_coeff: 0.001 # KL coefficient for PPO and Reinforce++
-kl_loss_coeff: 0.001 # KL loss coefficient
+kl_loss_coeff: 0.000 # KL loss coefficient
 pg_loss_coeff: 1.0 # Policy gradient loss coefficient
 entropy_coeff: 0.0 # Entropy coefficient
 clip_range_ratio: 0.2 # The clipping range for ratio between the old and new policy. (PPO algorithm)
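
Both GRPO configs flip `kl_loss_coeff` from 0.001 to 0.000, i.e. the explicit KL penalty added to the training loss is disabled while the reward-side `kl_coeff` stays at 0.001. As a rough illustration of what such a coefficient usually controls, the sketch below combines per-sample loss terms with these weights; it is a generic composition under assumed semantics, not PaddleNLP's actual loss code.

```python
# Generic sketch (not PaddleNLP's implementation) of how coefficients like
# pg_loss_coeff, kl_loss_coeff, and entropy_coeff are commonly combined.
def combine_losses(pg_loss, kl_loss, entropy,
                   pg_loss_coeff=1.0, kl_loss_coeff=0.000, entropy_coeff=0.0):
    # With kl_loss_coeff = 0.000 the explicit KL penalty term vanishes;
    # any KL control then has to come from the reward-side kl_coeff.
    return pg_loss_coeff * pg_loss + kl_loss_coeff * kl_loss - entropy_coeff * entropy

# Example: the KL term contributes nothing when its coefficient is zero.
print(combine_losses(pg_loss=0.8, kl_loss=2.5, entropy=1.2))  # -> 0.8
```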

paddlenlp/rl/trainer/ppo_trainer.py

Lines changed: 3 additions & 4 deletions
@@ -1526,7 +1526,7 @@ def train(
         if self.args.rl_algorithm == "ppo":
             batch["reward_values"] = self.critic_trainer.compute_value(**batch)

-        # danamic sampling: filter generated samples by rewards, keep generating until valid samples are enough
+        # dynamic sampling: filter generated samples by rewards, keep generating until valid samples are enough
         if self.args.dynamic_sampling:
             local_valid_prompt = 0
             # combined_batch = combine_micro_batches_into_batch(micro_batches, pad_token_id=self.tokenizer.pad_token_id)
@@ -1601,7 +1601,7 @@ def train(
                     total_batch = defaultdict(list)
                     total_valid_prompt = 0
                     num_gen_batches = 0
-                    logger.info("Danymic sampling completed. \n")
+                    logger.info("Dynamic sampling completed. \n")

                 else:
                     if self.args.max_gen_batches > 0 and num_gen_batches > self.args.max_gen_batches:
@@ -1664,7 +1664,7 @@ def train(
                 paddle.device.cuda.empty_cache()

                 if self.args.rl_algorithm == "ppo":
-                    rl_info["train_value_loss"] = self.critic_trainer.update_critc(micro_batch)
+                    rl_info["train_value_loss"] = self.critic_trainer.update_critic(micro_batch)
                 if self.is_step_end():
                     self.state.global_step += 1
                     self.state.epoch = epoch + (step + 1) / steps_in_epoch
@@ -1701,7 +1701,6 @@ def train(

                 if self.control.should_training_stop:
                     break
-                # TODO(guosheng): add epilogue of training
         logger.info("\nTraining completed. \n")
         if args.load_best_model_at_end and self.state.best_model_checkpoint is not None:
             if args.local_rank != -1:
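
The comment fixed in the first hunk describes dynamic sampling: generated samples are filtered by their rewards, and generation continues until enough valid prompts have been collected (bounded by `max_gen_batches`, as the second hunk shows). The sketch below illustrates that filtering loop in isolation; the `generate_batch` helper and the "all rewards identical means invalid" rule are assumptions for illustration, not the trainer's actual code.

```python
# Standalone sketch of dynamic sampling as described in the fixed comment:
# keep generating batches, drop prompts whose reward signal is uninformative,
# and stop once enough valid prompts have been collected.
# `generate_batch` and the filtering rule are hypothetical, for illustration only.
import random

def generate_batch(batch_size=4, group_size=4):
    # Pretend each prompt gets `group_size` sampled responses with 0/1 rewards.
    return [[random.randint(0, 1) for _ in range(group_size)] for _ in range(batch_size)]

def dynamic_sampling(target_valid_prompts=8, max_gen_batches=16):
    valid_groups, num_gen_batches = [], 0
    while len(valid_groups) < target_valid_prompts:
        num_gen_batches += 1
        if num_gen_batches > max_gen_batches:
            raise RuntimeError("Exceeded max_gen_batches without enough valid prompts.")
        for rewards in generate_batch():
            # A prompt is "valid" only if its rewards are not all identical,
            # i.e. the sampled group actually carries a learning signal.
            if len(set(rewards)) > 1:
                valid_groups.append(rewards)
    return valid_groups[:target_valid_prompts]

print(len(dynamic_sampling()))  # -> 8
```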
