Learning Rate scheduler #121

Open
vinm007 opened this issue Jan 5, 2024 · 1 comment

@vinm007 commented Jan 5, 2024

Hi,
Thanks for sharing this amazing work. I have been using this Hugging Face-based repo for fine-tuning - https://github.com/huggingface/instruction-tuned-sd - but I have some queries regarding the learning rate scheduler.
The paper says: "We use a learning rate of 10−4 (without any learning rate warm up)."
Is the learning rate constant, or is some scheduler used? I do see this config in train.yaml but can't quite understand it:

    scheduler_config: # 10000 warmup steps
      target: ldm.lr_scheduler.LambdaLinearScheduler
      params:
        warm_up_steps: [ 0 ]
        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
        f_start: [ 1.e-6 ]
        f_max: [ 1. ]
        f_min: [ 1. ]

If it is being used, what would be the equivalent of this scheduler in diffusers?

@eastchun commented Oct 5, 2024

Hi,

This is the so-called LambdaLinearScheduler from ldm.lr_scheduler.
With the above params (warm_up_steps: [0]), the warm-up branch is never entered, so the scheduler returns a constant LR multiplier of 1.0 and the optimizer just runs at its base learning rate (the paper's 1e-4) with no warm-up.

If warm_up_steps is set to something other than [0] (say [2500]), the multiplier instead ramps linearly from f_start = 1e-6 to f_max = 1.0 over steps 0 to 2500 and then stays at 1.0, i.e. the learning rate warms up linearly to the optimizer's base value (1e-5 in the script below) and remains constant all the way to the end.
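As a quick sanity check, here is a minimal sketch of the multiplier those two branches produce, assuming warm_up_steps=[2500] and the f_start/f_max/f_min values from the config above (the step values are chosen just for illustration):

```python
# Standalone sketch of the LambdaLinearScheduler multiplier, assuming
# warm_up_steps=[2500], f_start=[1e-6], f_max=[1.0], f_min=[1.0].
warm_up, f_start, f_max, f_min, cycle_len = 2500, 1e-6, 1.0, 1.0, 10_000_000_000_000

def multiplier(n):
    if n < warm_up:  # linear warm-up from f_start to f_max
        return (f_max - f_start) / warm_up * n + f_start
    # linear "decay" from f_max to f_min; with f_max == f_min it stays at 1.0
    return f_min + (f_max - f_min) * (cycle_len - n) / cycle_len

for n in (0, 1250, 2500, 100_000):
    print(n, multiplier(n))  # ~1e-6, ~0.5, 1.0, 1.0 (times the optimizer's base LR)
```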

Please refer to the following code snippet:

```python
### LambdaLinearScheduler.py
#
# Run with:
#   CUDA_VISIBLE_DEVICES=0 torchrun --master_port 10000 LambdaLinearScheduler.py

import argparse
import os

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.distributed as dist
from torch.distributed.optim import ZeroRedundancyOptimizer
from torch.nn.parallel import DistributedDataParallel as DDP


class LambdaWarmUpCosineScheduler2:
    """Supports repeated cycles, configurable via lists (note: use with a base_lr of 1.0)."""

    def __init__(self, warm_up_steps, f_min, f_max, f_start, cycle_lengths, verbosity_interval=0):
        assert len(warm_up_steps) == len(f_min) == len(f_max) == len(f_start) == len(cycle_lengths)
        self.lr_warm_up_steps = warm_up_steps  # e.g. [0]
        self.f_start = f_start                 # e.g. [1e-6]
        self.f_min = f_min                     # e.g. [1.0]
        self.f_max = f_max                     # e.g. [1.0]
        self.cycle_lengths = cycle_lengths     # e.g. [10000000000000]
        self.cum_cycles = np.cumsum([0] + list(self.cycle_lengths))  # [0, 10000000000000]
        self.last_f = 0.
        self.verbosity_interval = verbosity_interval

    def find_in_interval(self, n):
        interval = 0
        for cl in self.cum_cycles[1:]:
            if n <= cl:
                return interval
            interval += 1

    def schedule(self, n, **kwargs):
        cycle = self.find_in_interval(n)   # 0 for the single-cycle config above
        n = n - self.cum_cycles[cycle]     # n - 0 = n

        if self.verbosity_interval > 0:
            if n % self.verbosity_interval == 0:
                print(f"current step: {n}, recent lr-multiplier: {self.last_f}, current cycle {cycle}")

        if n < self.lr_warm_up_steps[cycle]:
            # Linear warm-up from f_start to f_max
            f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle]
            self.last_f = f
            return f
        else:
            # Cosine decay from f_max to f_min over the rest of the cycle;
            # with f_min == f_max == 1.0 this is always 1.0
            t = (n - self.lr_warm_up_steps[cycle]) / (self.cycle_lengths[cycle] - self.lr_warm_up_steps[cycle])
            t = min(t, 1.0)
            f = self.f_min[cycle] + 0.5 * (self.f_max[cycle] - self.f_min[cycle]) * (1 + np.cos(t * np.pi))
            self.last_f = f
            return f

    def __call__(self, n, **kwargs):
        return self.schedule(n, **kwargs)


class LambdaLinearScheduler(LambdaWarmUpCosineScheduler2):

    def schedule(self, n, **kwargs):
        cycle = self.find_in_interval(n)   # 0
        n = n - self.cum_cycles[cycle]     # n - 0 = n

        if self.verbosity_interval > 0:
            if n % self.verbosity_interval == 0:
                print(f"current step: {n}, recent lr-multiplier: {self.last_f}, current cycle {cycle}")

        if n < self.lr_warm_up_steps[cycle]:
            # Linear warm-up, e.g. f = (1.0 - 1e-6) / 2500 * n + 1e-6 for warm_up_steps=[2500]
            f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle]
            self.last_f = f
            return f
        else:
            # Linear decay from f_max to f_min; with f_min == f_max == 1.0 this is always 1.0
            f = self.f_min[cycle] + (self.f_max[cycle] - self.f_min[cycle]) * (self.cycle_lengths[cycle] - n) / self.cycle_lengths[cycle]
            self.last_f = f
            return f


def main(args):
    args.world_size = int(os.environ['WORLD_SIZE'])
    args.local_rank = int(os.environ['LOCAL_RANK'])
    args.rank = int(os.environ['RANK'])
    args.device = torch.device("cuda", args.local_rank)

    dist.init_process_group("nccl", rank=args.rank, world_size=args.world_size)
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.benchmark = True

    global_step = 0
    model = torch.nn.Linear(10, 1)  # toy model, just to drive the optimizer/scheduler
    model.to(args.local_rank)

    torch.cuda.set_device(args.local_rank)
    model = DDP(
        model,
        device_ids=[args.local_rank],
        output_device=args.local_rank,
        broadcast_buffers=False,
        bucket_cap_mb=128,
        find_unused_parameters=True,
        gradient_as_bucket_view=True,  # saves memory
    )

    optimizer = ZeroRedundancyOptimizer(
        params=model.parameters(),
        lr=args.lr,                        # 1e-05
        optimizer_class=torch.optim.AdamW,
        weight_decay=args.weight_decay,    # 0
    )

    lr_scheduler = LambdaLinearScheduler(
        warm_up_steps=[args.lr_anneal_steps],  # [0]; 2500 for resuming, 10000 if starting from scratch
        cycle_lengths=[10000000000000],        # incredibly large number to prevent corner cases
        f_start=[1.e-6],
        f_max=[1.],
        f_min=[1.],
    )

    # Wrap the multiplier schedule in a standard PyTorch LambdaLR
    lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_scheduler.schedule)
    lr_scheduler.step(global_step)

    lr_list = []
    for itr in range(global_step, args.num_train_steps):
        lr_scheduler.step()
        optimizer.consolidate_state_dict(to=0)  # consolidate optimizer state on rank 0 so we can read the lr below
        lr = optimizer.state_dict()['param_groups'][0]['lr']
        lr_list.append(lr)
        global_step += 1

    with open(r'/mnt/c/dreambyte/MagicPose/exercise/lr_list.txt', 'w') as fp:
        fp.write("\n".join(str(item) for item in lr_list))

    xs = [x for x in range(len(lr_list))]
    fig = plt.figure()
    plt.plot(xs, lr_list)
    plt.savefig('lr_list.png', dpi=fig.dpi)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='ControlNet training')

    parser.add_argument('--num_train_steps', type=int, default=20000, help='number of train steps')
    parser.add_argument('--lr', type=float, default=1e-05, help='initial lr value')
    parser.add_argument('--weight_decay', type=float, default=0, help='weight decay (L2) regularization')
    parser.add_argument('--lr_anneal_steps', type=int, default=10000,
                        help='warm-up steps: 0 for no warm-up, 2500 for resuming, 10000 when starting from scratch')
    args = parser.parse_args()
    main(args)
```
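
To the original question about a diffusers equivalent: since this config reduces to a constant multiplier of 1.0 (optionally preceded by a linear warm-up), the closest built-in schedules in diffusers should, as far as I can tell, be `constant` and `constant_with_warmup` via `diffusers.optimization.get_scheduler`. A minimal sketch (the toy model and optimizer are placeholders; note that diffusers warms up from 0 rather than from 1e-6 times the base LR, which is practically the same):

```python
import torch
from diffusers.optimization import get_scheduler

# Placeholder model/optimizer, just to illustrate the scheduler call.
model = torch.nn.Linear(10, 1)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)  # paper's LR of 1e-4

# warm_up_steps: [0]  ->  constant LR, no warm-up
lr_scheduler = get_scheduler("constant", optimizer=optimizer)

# Non-zero warm_up_steps (e.g. 2500)  ->  linear warm-up, then constant
lr_scheduler = get_scheduler(
    "constant_with_warmup",
    optimizer=optimizer,
    num_warmup_steps=2500,
)
```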
