diff --git a/docs/zh/llm/devices/intel_hpu/tests/README.md b/docs/zh/llm/devices/intel_hpu/tests/README.md
new file mode 120000
index 000000000000..be7740d62f9c
--- /dev/null
+++ b/docs/zh/llm/devices/intel_hpu/tests/README.md
@@ -0,0 +1 @@
+../../../../../../llm/devices/intel_hpu/tests/README.md
\ No newline at end of file
diff --git a/docs/zh/llm/docs/pretrain.md b/docs/zh/llm/docs/pretrain.md
deleted file mode 100644
index deb15e9b8335..000000000000
--- a/docs/zh/llm/docs/pretrain.md
+++ /dev/null
@@ -1 +0,0 @@
-../../../../llm/docs/pretrain.md
diff --git a/docs/zh/llm/docs/pretrain.md b/docs/zh/llm/docs/pretrain.md
new file mode 120000
index 000000000000..d194a5436713
--- /dev/null
+++ b/docs/zh/llm/docs/pretrain.md
@@ -0,0 +1 @@
+../../../../llm/docs/pretrain.md
\ No newline at end of file
diff --git a/llm/alignment/rl/README.md b/llm/alignment/rl/README.md
index 3f31df529f71..8ec2238b0c76 100644
--- a/llm/alignment/rl/README.md
+++ b/llm/alignment/rl/README.md
@@ -203,6 +203,7 @@
 export FLAGS_cascade_attention_max_partition_size=2048
 python -u -m paddle.distributed.launch --devices "0,1,2,3" run_rl.py ../../config/qwen/reinforce_plus_plus_argument.yaml
 ```
+We provide a [wandb log](https://api.wandb.ai/links/ainlp66-netflix/injcw3ra) that can be reproduced with the script above.
 
 ### Online monitoring
 The output directory set in `grpo_argument.yaml` and `reinforce_plus_plus_argument.yaml` is `"logging_dir": "vdl_log"`; the training process can be monitored with the following command
diff --git a/llm/config/qwen/grpo_32b_argument.yaml b/llm/config/qwen/grpo_32b_argument.yaml
index 743f995d8c32..0b9259c2b844 100644
--- a/llm/config/qwen/grpo_32b_argument.yaml
+++ b/llm/config/qwen/grpo_32b_argument.yaml
@@ -74,7 +74,7 @@ disable_tqdm: true  # Whether to disable tqdm progress bar
 
 # RL args
 kl_coeff: 0.001  # KL coefficient for PPO and Reinforce++
-kl_loss_coeff: 0.001  # KL loss coefficient
+kl_loss_coeff: 0.000  # KL loss coefficient
 pg_loss_coeff: 1.0  # Policy gradient loss coefficient
 entropy_coeff: 0.0  # Entropy coefficient
 clip_range_ratio: 0.2  # The clipping range for ratio between the old and new policy. (PPO algorithm)
diff --git a/llm/config/qwen/grpo_argument.yaml b/llm/config/qwen/grpo_argument.yaml
index bfb761ee5a5f..cd106746d951 100644
--- a/llm/config/qwen/grpo_argument.yaml
+++ b/llm/config/qwen/grpo_argument.yaml
@@ -74,7 +74,7 @@ disable_tqdm: true  # Whether to disable tqdm progress bar
 
 # RL args
 kl_coeff: 0.001  # KL coefficient for PPO and Reinforce++
-kl_loss_coeff: 0.001  # KL loss coefficient
+kl_loss_coeff: 0.000  # KL loss coefficient
 pg_loss_coeff: 1.0  # Policy gradient loss coefficient
 entropy_coeff: 0.0  # Entropy coefficient
 clip_range_ratio: 0.2  # The clipping range for ratio between the old and new policy. (PPO algorithm)
diff --git a/paddlenlp/rl/trainer/ppo_trainer.py b/paddlenlp/rl/trainer/ppo_trainer.py
index 0f36cd380905..762517705908 100644
--- a/paddlenlp/rl/trainer/ppo_trainer.py
+++ b/paddlenlp/rl/trainer/ppo_trainer.py
@@ -1526,7 +1526,7 @@ def train(
                 if self.args.rl_algorithm == "ppo":
                     batch["reward_values"] = self.critic_trainer.compute_value(**batch)
 
-                # danamic sampling: filter generated samples by rewards, keep generating until valid samples are enough
+                # dynamic sampling: filter generated samples by rewards, keep generating until valid samples are enough
                 if self.args.dynamic_sampling:
                     local_valid_prompt = 0
                     # combined_batch = combine_micro_batches_into_batch(micro_batches, pad_token_id=self.tokenizer.pad_token_id)
@@ -1601,7 +1601,7 @@ def train(
                         total_batch = defaultdict(list)
                         total_valid_prompt = 0
                         num_gen_batches = 0
-                        logger.info("Danymic sampling completed. \n")
+                        logger.info("Dynamic sampling completed. \n")
 
                     else:
                         if self.args.max_gen_batches > 0 and num_gen_batches > self.args.max_gen_batches:
@@ -1664,7 +1664,7 @@ def train(
                     paddle.device.cuda.empty_cache()
 
                 if self.args.rl_algorithm == "ppo":
-                    rl_info["train_value_loss"] = self.critic_trainer.update_critc(micro_batch)
+                    rl_info["train_value_loss"] = self.critic_trainer.update_critic(micro_batch)
                 if self.is_step_end():
                     self.state.global_step += 1
                     self.state.epoch = epoch + (step + 1) / steps_in_epoch
@@ -1701,7 +1701,6 @@ def train(
                 if self.control.should_training_stop:
                     break
 
-        # TODO(guosheng): add epilogue of training
         logger.info("\nTraining completed. \n")
         if args.load_best_model_at_end and self.state.best_model_checkpoint is not None:
             if args.local_rank != -1: