Learning Rate scheduler #121

Open
vinm007 opened this issue Jan 5, 2024 · 1 comment

@vinm007 commented Jan 5, 2024

Hi,
Thanks for sharing this amazing work. I have been using this Hugging Face-based repo for fine-tuning - https://github.com/huggingface/instruction-tuned-sd - but I have some queries regarding the learning rate scheduler.
The paper says: "We use a learning rate of 10−4 (without any learning rate warm up)."
Is the learning rate constant, or is some scheduler used? I do see this config in train.yaml but can't quite understand it:

    scheduler_config: # 10000 warmup steps
      target: ldm.lr_scheduler.LambdaLinearScheduler
      params:
        warm_up_steps: [ 0 ]
        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
        f_start: [ 1.e-6 ]
        f_max: [ 1. ]
        f_min: [ 1. ]

If it is being used, what would be the equivalent of this scheduler in diffusers?

@eastchun commented Oct 5, 2024

Hi,

This is the so-called LambdaLinearScheduler from ldm.lr_scheduler.
With the above params (warm_up_steps: [0]), the warm-up branch is never entered, so the scheduler returns a constant LR multiplier of 1.0 and the optimizer just runs at its base learning rate (the paper's 1e-4) with no warm-up.

If warm_up_steps is set to something other than [0] (say [2500]), the multiplier instead ramps linearly from f_start = 1e-6 to f_max = 1.0 over steps 0 to 2500 and then stays at 1.0, i.e. the learning rate warms up linearly to the optimizer's base value (1e-5 in the script below) and remains constant all the way to the end.
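As a quick sanity check, here is a minimal sketch of the multiplier those two branches produce, assuming warm_up_steps=[2500] and the f_start/f_max/f_min values from the config above (the step values are chosen just for illustration):

```python
# Standalone sketch of the LambdaLinearScheduler multiplier, assuming
# warm_up_steps=[2500], f_start=[1e-6], f_max=[1.0], f_min=[1.0].
warm_up, f_start, f_max, f_min, cycle_len = 2500, 1e-6, 1.0, 1.0, 10_000_000_000_000

def multiplier(n):
    if n < warm_up:  # linear warm-up from f_start to f_max
        return (f_max - f_start) / warm_up * n + f_start
    # linear "decay" from f_max to f_min; with f_max == f_min it stays at 1.0
    return f_min + (f_max - f_min) * (cycle_len - n) / cycle_len

for n in (0, 1250, 2500, 100_000):
    print(n, multiplier(n))  # ~1e-6, ~0.5, 1.0, 1.0 (times the optimizer's base LR)
```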

Please refer to the following code snippet:

```python
### LambdaLinearScheduler.py
#
# Run with:
#   CUDA_VISIBLE_DEVICES=0 torchrun --master_port 10000 LambdaLinearScheduler.py

import argparse
import os

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.distributed as dist
from torch.distributed.optim import ZeroRedundancyOptimizer
from torch.nn.parallel import DistributedDataParallel as DDP


class LambdaWarmUpCosineScheduler2:
    """Supports repeated cycles, configurable via lists (note: use with a base_lr of 1.0)."""

    def __init__(self, warm_up_steps, f_min, f_max, f_start, cycle_lengths, verbosity_interval=0):
        assert len(warm_up_steps) == len(f_min) == len(f_max) == len(f_start) == len(cycle_lengths)
        self.lr_warm_up_steps = warm_up_steps  # e.g. [0]
        self.f_start = f_start                 # e.g. [1e-6]
        self.f_min = f_min                     # e.g. [1.0]
        self.f_max = f_max                     # e.g. [1.0]
        self.cycle_lengths = cycle_lengths     # e.g. [10000000000000]
        self.cum_cycles = np.cumsum([0] + list(self.cycle_lengths))  # [0, 10000000000000]
        self.last_f = 0.
        self.verbosity_interval = verbosity_interval

    def find_in_interval(self, n):
        interval = 0
        for cl in self.cum_cycles[1:]:
            if n <= cl:
                return interval
            interval += 1

    def schedule(self, n, **kwargs):
        cycle = self.find_in_interval(n)   # 0 for the single-cycle config above
        n = n - self.cum_cycles[cycle]     # n - 0 = n

        if self.verbosity_interval > 0:
            if n % self.verbosity_interval == 0:
                print(f"current step: {n}, recent lr-multiplier: {self.last_f}, current cycle {cycle}")

        if n < self.lr_warm_up_steps[cycle]:
            # Linear warm-up from f_start to f_max
            f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle]
            self.last_f = f
            return f
        else:
            # Cosine decay from f_max to f_min over the rest of the cycle;
            # with f_min == f_max == 1.0 this is always 1.0
            t = (n - self.lr_warm_up_steps[cycle]) / (self.cycle_lengths[cycle] - self.lr_warm_up_steps[cycle])
            t = min(t, 1.0)
            f = self.f_min[cycle] + 0.5 * (self.f_max[cycle] - self.f_min[cycle]) * (1 + np.cos(t * np.pi))
            self.last_f = f
            return f

    def __call__(self, n, **kwargs):
        return self.schedule(n, **kwargs)


class LambdaLinearScheduler(LambdaWarmUpCosineScheduler2):

    def schedule(self, n, **kwargs):
        cycle = self.find_in_interval(n)   # 0
        n = n - self.cum_cycles[cycle]     # n - 0 = n

        if self.verbosity_interval > 0:
            if n % self.verbosity_interval == 0:
                print(f"current step: {n}, recent lr-multiplier: {self.last_f}, current cycle {cycle}")

        if n < self.lr_warm_up_steps[cycle]:
            # Linear warm-up, e.g. f = (1.0 - 1e-6) / 2500 * n + 1e-6 for warm_up_steps=[2500]
            f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle]
            self.last_f = f
            return f
        else:
            # Linear decay from f_max to f_min; with f_min == f_max == 1.0 this is always 1.0
            f = self.f_min[cycle] + (self.f_max[cycle] - self.f_min[cycle]) * (self.cycle_lengths[cycle] - n) / self.cycle_lengths[cycle]
            self.last_f = f
            return f


def main(args):
    args.world_size = int(os.environ['WORLD_SIZE'])
    args.local_rank = int(os.environ['LOCAL_RANK'])
    args.rank = int(os.environ['RANK'])
    args.device = torch.device("cuda", args.local_rank)

    dist.init_process_group("nccl", rank=args.rank, world_size=args.world_size)
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.benchmark = True

    global_step = 0
    model = torch.nn.Linear(10, 1)  # toy model, just to drive the optimizer/scheduler
    model.to(args.local_rank)

    torch.cuda.set_device(args.local_rank)
    model = DDP(
        model,
        device_ids=[args.local_rank],
        output_device=args.local_rank,
        broadcast_buffers=False,
        bucket_cap_mb=128,
        find_unused_parameters=True,
        gradient_as_bucket_view=True,  # saves memory
    )

    optimizer = ZeroRedundancyOptimizer(
        params=model.parameters(),
        lr=args.lr,                        # 1e-05
        optimizer_class=torch.optim.AdamW,
        weight_decay=args.weight_decay,    # 0
    )

    lr_scheduler = LambdaLinearScheduler(
        warm_up_steps=[args.lr_anneal_steps],  # [0]; 2500 for resuming, 10000 if starting from scratch
        cycle_lengths=[10000000000000],        # incredibly large number to prevent corner cases
        f_start=[1.e-6],
        f_max=[1.],
        f_min=[1.],
    )

    # Wrap the multiplier schedule in a standard PyTorch LambdaLR
    lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_scheduler.schedule)
    lr_scheduler.step(global_step)

    lr_list = []
    for itr in range(global_step, args.num_train_steps):
        lr_scheduler.step()
        optimizer.consolidate_state_dict(to=0)  # consolidate optimizer state on rank 0 so we can read the lr below
        lr = optimizer.state_dict()['param_groups'][0]['lr']
        lr_list.append(lr)
        global_step += 1

    with open(r'/mnt/c/dreambyte/MagicPose/exercise/lr_list.txt', 'w') as fp:
        fp.write("\n".join(str(item) for item in lr_list))

    xs = [x for x in range(len(lr_list))]
    fig = plt.figure()
    plt.plot(xs, lr_list)
    plt.savefig('lr_list.png', dpi=fig.dpi)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='ControlNet training')

    parser.add_argument('--num_train_steps', type=int, default=20000, help='number of train steps')
    parser.add_argument('--lr', type=float, default=1e-05, help='initial lr value')
    parser.add_argument('--weight_decay', type=float, default=0, help='weight decay (L2) regularization')
    parser.add_argument('--lr_anneal_steps', type=int, default=10000,
                        help='warm-up steps: 0 for no warm-up, 2500 for resuming, 10000 when starting from scratch')
    args = parser.parse_args()
    main(args)
```
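
To the original question about a diffusers equivalent: since this config reduces to a constant multiplier of 1.0 (optionally preceded by a linear warm-up), the closest built-in schedules in diffusers should, as far as I can tell, be `constant` and `constant_with_warmup` via `diffusers.optimization.get_scheduler`. A minimal sketch (the toy model and optimizer are placeholders; note that diffusers warms up from 0 rather than from 1e-6 times the base LR, which is practically the same):

```python
import torch
from diffusers.optimization import get_scheduler

# Placeholder model/optimizer, just to illustrate the scheduler call.
model = torch.nn.Linear(10, 1)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)  # paper's LR of 1e-4

# warm_up_steps: [0]  ->  constant LR, no warm-up
lr_scheduler = get_scheduler("constant", optimizer=optimizer)

# Non-zero warm_up_steps (e.g. 2500)  ->  linear warm-up, then constant
lr_scheduler = get_scheduler(
    "constant_with_warmup",
    optimizer=optimizer,
    num_warmup_steps=2500,
)
```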
