@@ -22,7 +22,10 @@
     TogetherRequest,
     TrainingType,
     FinetuneLRScheduler,
+    FinetuneLinearLRScheduler,
+    FinetuneCosineLRScheduler,
     FinetuneLinearLRSchedulerArgs,
+    FinetuneCosineLRSchedulerArgs,
     TrainingMethodDPO,
     TrainingMethodSFT,
     FinetuneCheckpoint,
@@ -57,7 +60,9 @@ def createFinetuneRequest(
     n_checkpoints: int | None = 1,
     batch_size: int | Literal["max"] = "max",
     learning_rate: float | None = 0.00001,
+    lr_scheduler_type: Literal["linear", "cosine"] = "linear",
     min_lr_ratio: float = 0.0,
+    scheduler_num_cycles: float = 0.5,
     warmup_ratio: float = 0.0,
     max_grad_norm: float = 1.0,
     weight_decay: float = 0.0,
@@ -134,10 +139,22 @@ def createFinetuneRequest(
             f"training_method must be one of {', '.join(AVAILABLE_TRAINING_METHODS)}"
         )

-    lrScheduler = FinetuneLRScheduler(
-        lr_scheduler_type="linear",
-        lr_scheduler_args=FinetuneLinearLRSchedulerArgs(min_lr_ratio=min_lr_ratio),
-    )
+    # Default to generic lr scheduler
+    lrScheduler: FinetuneLRScheduler = FinetuneLRScheduler(lr_scheduler_type="linear")
+
+    if lr_scheduler_type == "cosine":
+        if scheduler_num_cycles <= 0.0:
+            raise ValueError("Number of cycles should be greater than 0")
+
+        lrScheduler = FinetuneCosineLRScheduler(
+            lr_scheduler_args=FinetuneCosineLRSchedulerArgs(
+                min_lr_ratio=min_lr_ratio, num_cycles=scheduler_num_cycles
+            ),
+        )
+    else:
+        lrScheduler = FinetuneLinearLRScheduler(
+            lr_scheduler_args=FinetuneLinearLRSchedulerArgs(min_lr_ratio=min_lr_ratio),
+        )

     training_method_cls: TrainingMethodSFT | TrainingMethodDPO = TrainingMethodSFT()
     if training_method == "dpo":
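Note: the hunk above picks the scheduler class from the new lr_scheduler_type argument and rejects non-positive scheduler_num_cycles values. A minimal standalone sketch of the decay curve that min_lr_ratio and scheduler_num_cycles describe, assuming a conventional cosine-with-minimum-LR formula (the actual schedule is computed by the fine-tuning backend and may differ):

import math

def cosine_lr(step: int, total_steps: int, base_lr: float = 1e-5,
              min_lr_ratio: float = 0.0, num_cycles: float = 0.5) -> float:
    """Cosine decay from base_lr down to base_lr * min_lr_ratio.

    num_cycles=0.5 gives a single half-cosine decay; larger values oscillate.
    """
    progress = step / max(1, total_steps)
    cosine = 0.5 * (1.0 + math.cos(2.0 * math.pi * num_cycles * progress))
    return base_lr * (min_lr_ratio + (1.0 - min_lr_ratio) * cosine)

# With the defaults (num_cycles=0.5, min_lr_ratio=0.0) the learning rate
# falls monotonically from base_lr to 0 over the course of training.
print(cosine_lr(0, 100), cosine_lr(50, 100), cosine_lr(100, 100))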
@@ -249,7 +266,9 @@ def create(
         n_checkpoints: int | None = 1,
         batch_size: int | Literal["max"] = "max",
         learning_rate: float | None = 0.00001,
+        lr_scheduler_type: Literal["linear", "cosine"] = "linear",
         min_lr_ratio: float = 0.0,
+        scheduler_num_cycles: float = 0.5,
         warmup_ratio: float = 0.0,
         max_grad_norm: float = 1.0,
         weight_decay: float = 0.0,
@@ -284,9 +303,11 @@ def create(
             batch_size (int or "max"): Batch size for fine-tuning. Defaults to max.
             learning_rate (float, optional): Learning rate multiplier to use for training
                 Defaults to 0.00001.
+            lr_scheduler_type (Literal["linear", "cosine"]): Learning rate scheduler type. Defaults to "linear".
             min_lr_ratio (float, optional): Min learning rate ratio of the initial learning rate for
                 the learning rate scheduler. Defaults to 0.0.
-            warmup_ratio (float, optional): Warmup ratio for learning rate scheduler.
+            scheduler_num_cycles (float, optional): Number or fraction of cycles for the cosine learning rate scheduler. Defaults to 0.5.
+            warmup_ratio (float, optional): Warmup ratio for the learning rate scheduler.
             max_grad_norm (float, optional): Max gradient norm. Defaults to 1.0, set to 0 to disable.
             weight_decay (float, optional): Weight decay. Defaults to 0.0.
             lora (bool, optional): Whether to use LoRA adapters. Defaults to True.
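With the signature and docstring above, a caller could request a cosine schedule roughly as follows. This is a hedged sketch: the file ID and model name are placeholders, and it assumes the resource is exposed as client.fine_tuning in the published SDK.

from together import Together

client = Together()  # reads TOGETHER_API_KEY from the environment

job = client.fine_tuning.create(
    training_file="file-abc123",  # placeholder file ID
    model="meta-llama/Meta-Llama-3.1-8B-Instruct-Reference",  # placeholder model
    learning_rate=1e-5,
    lr_scheduler_type="cosine",   # new argument; "linear" remains the default
    scheduler_num_cycles=0.5,     # single half-cosine decay over training
    min_lr_ratio=0.1,             # finish at 10% of the initial learning rate
)
print(job.id)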
@@ -353,7 +374,9 @@ def create(
             n_checkpoints=n_checkpoints,
             batch_size=batch_size,
             learning_rate=learning_rate,
+            lr_scheduler_type=lr_scheduler_type,
             min_lr_ratio=min_lr_ratio,
+            scheduler_num_cycles=scheduler_num_cycles,
             warmup_ratio=warmup_ratio,
             max_grad_norm=max_grad_norm,
             weight_decay=weight_decay,
@@ -634,7 +657,9 @@ async def create(
         n_checkpoints: int | None = 1,
         batch_size: int | Literal["max"] = "max",
         learning_rate: float | None = 0.00001,
+        lr_scheduler_type: Literal["linear", "cosine"] = "linear",
         min_lr_ratio: float = 0.0,
+        scheduler_num_cycles: float = 0.5,
         warmup_ratio: float = 0.0,
         max_grad_norm: float = 1.0,
         weight_decay: float = 0.0,
@@ -669,9 +694,11 @@ async def create(
             batch_size (int, optional): Batch size for fine-tuning. Defaults to max.
             learning_rate (float, optional): Learning rate multiplier to use for training
                 Defaults to 0.00001.
+            lr_scheduler_type (Literal["linear", "cosine"]): Learning rate scheduler type. Defaults to "linear".
             min_lr_ratio (float, optional): Min learning rate ratio of the initial learning rate for
                 the learning rate scheduler. Defaults to 0.0.
-            warmup_ratio (float, optional): Warmup ratio for learning rate scheduler.
+            scheduler_num_cycles (float, optional): Number or fraction of cycles for the cosine learning rate scheduler. Defaults to 0.5.
+            warmup_ratio (float, optional): Warmup ratio for the learning rate scheduler.
             max_grad_norm (float, optional): Max gradient norm. Defaults to 1.0, set to 0 to disable.
             weight_decay (float, optional): Weight decay. Defaults to 0.0.
             lora (bool, optional): Whether to use LoRA adapters. Defaults to True.
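The async path takes the same two new arguments. A hedged sketch of the asynchronous call, assuming the AsyncTogether client exported by the same package and the same placeholder IDs as the synchronous example above:

import asyncio
from together import AsyncTogether

async def main() -> None:
    client = AsyncTogether()
    job = await client.fine_tuning.create(
        training_file="file-abc123",  # placeholder file ID
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Reference",  # placeholder model
        lr_scheduler_type="cosine",
        scheduler_num_cycles=0.5,
    )
    print(job.id)

asyncio.run(main())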
@@ -738,7 +765,9 @@ async def create(
             n_checkpoints=n_checkpoints,
             batch_size=batch_size,
             learning_rate=learning_rate,
+            lr_scheduler_type=lr_scheduler_type,
             min_lr_ratio=min_lr_ratio,
+            scheduler_num_cycles=scheduler_num_cycles,
             warmup_ratio=warmup_ratio,
             max_grad_norm=max_grad_norm,
             weight_decay=weight_decay,