Fix GPU OOM during quantization

AllentDan · Jan 7, 2025 · fd40edb · fd40edb
1 parent 8952e80
commit fd40edb
Show file tree

Hide file tree

Showing 2 changed files with 13 additions and 2 deletions.
diff --git a/lmdeploy/lite/apis/smooth_quant.py b/lmdeploy/lite/apis/smooth_quant.py
@@ -89,6 +89,8 @@ def smooth_quant(model: str,
         parent = model.get_submodule(parent_name)
         setattr(parent, child_name, q_linear)
         linear.to('cpu')
+        q_linear.to('cpu')
+        torch.cuda.empty_cache()
 
     for name, norm in rmsnorms.items():
         if skipped_module(name):
@@ -99,6 +101,8 @@ def smooth_quant(model: str,
         parent = model.get_submodule(parent_name)
         setattr(parent, child_name, q_norm)
         norm.to('cpu')
+        q_norm.to('cpu')  # free the new norm, not the stale q_linear
+        torch.cuda.empty_cache()
 
     if vl_model:
         from .auto_awq import save_vl_model

diff --git a/lmdeploy/lite/quantization/awq.py b/lmdeploy/lite/quantization/awq.py
@@ -325,6 +325,7 @@ def quant_weights(model, fcs, bits, symmetry, group_size=-1, device='cuda'):
                                                          scales, zeros))
         setattr(parent, child_name, q_linear)
         fc.to('cpu')
+        torch.cuda.empty_cache()
 
         print(f'{name} weight {pack_or_skip}.')
 
@@ -366,7 +367,10 @@ def smooth_layers(layers,
             smooth_fc_fcs(fc, fcs, a_scales[a_name], group_size)
 
         layer.to('cpu')
-        print(f'{l_name} smooth weight done.')
+        torch.cuda.empty_cache()
+        max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024
+        print(f'{l_name} smooth weight done.'
+              f' max gpu memory: {max_memory:.2f} GB')
 
 
 def pseudo_quantize_tensor(w,
@@ -434,4 +438,7 @@ def awq_layers(layers,
             smooth_fc_fcs(fc, fcs, a_scales[a_name], group_size, ratio)
 
         layer.to('cpu')
-        print(f'{l_name} smooth weight done.')
+        torch.cuda.empty_cache()
+        max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024
+        print(f'{l_name} smooth weight done.'
+              f' max gpu memory: {max_memory:.2f} GB')