Commit e287911

Merge branch 'huggingface:main' into cj_fix_deepspeed_multinode_mpi_launchers
chiragjn authored Jan 27, 2025
2 parents b2b65c5 + 675e35b commit e287911
Showing 12 changed files with 54 additions and 28 deletions.
18 changes: 18 additions & 0 deletions Makefile
@@ -70,3 +70,21 @@ test_prod:

test_rest:
python -m pytest -s -v ./tests/test_examples.py::FeatureExamplesTests -k "not by_step and not by_epoch" $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_rest.log",)

# For developers to prepare a release
prepare_release:
rm -rf dist build
python setup.py bdist_wheel sdist

# Make sure this is run in a fresh venv of some form
install_test_release:
pip uninstall accelerate -y
pip install -i https://testpypi.python.org/pypi --extra-index-url https://pypi.org/simple accelerate

# Run as `make target=testpypi upload_release`
upload_release:
@if [ "$(target)" != "testpypi" ] && [ "$(target)" != "pypi" ]; then \
echo "Error: target must be either 'testpypi' or 'pypi'"; \
exit 1; \
fi
twine upload dist/* -r $(target)
13 changes: 4 additions & 9 deletions setup.py
@@ -103,20 +103,15 @@
# git tag v<VERSION> -m 'Adds tag v<VERSION> for pypi'
# Push the tag and release commit to git: git push --tags origin vXX.xx-release
# 5. Run the following commands in the top-level directory:
# rm -rf dist
# rm -rf build
# python setup.py bdist_wheel
# python setup.py sdist
# make prepare_release
# 6. Upload the package to the pypi test server first:
# twine upload dist/* -r testpypi
# make target=testpypi upload_release
# 7. Check that you can install it in a virtualenv by running:
# pip install accelerate
# pip uninstall accelerate
# pip install -i https://testpypi.python.org/pypi accelerate
# make install_test_release
# accelerate env
# accelerate test
# 8. Upload the final version to actual pypi:
# twine upload dist/* -r pypi
# make target=pypi upload_release
# 9. Add release notes to the tag in github once everything is looking hunky-dory.
# 10. Go back to the main branch and update the version in __init__.py, setup.py to the new version ".dev" and push to
# main.
4 changes: 2 additions & 2 deletions src/accelerate/accelerator.py
@@ -329,8 +329,8 @@ def __init__(
if compare_versions("deepspeed-mlu", "<", "0.10.1"):
raise ImportError("DeepSpeed MLU version must be >= 0.10.1. Please update DeepSpeed MLU.")
elif is_musa_available():
if compare_versions("deepspeed", ">", "0.14.3"):
raise ImportError("DeepSpeed MUSA version must be <= 0.14.3. Please downgrade DeepSpeed.")
if compare_versions("deepspeed", "<", "0.14.3"):
raise ImportError("DeepSpeed MUSA version must be >= 0.14.3. Please update DeepSpeed.")
elif compare_versions("deepspeed", "<", "0.9.3"):
raise ImportError("DeepSpeed version must be >= 0.9.3. Please update DeepSpeed.")

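Worth noting for reviewers: the MUSA branch now imposes a minimum rather than a maximum DeepSpeed version. Below is a minimal sketch of the resulting version gate, assuming `compare_versions` and the availability helpers live in `accelerate.utils.versions` / `accelerate.utils.imports` as the rest of the diff suggests; it is an illustration, not the verbatim `Accelerator.__init__` code.

```python
# Sketch of the DeepSpeed version gate after this change (illustrative only).
# Assumptions: compare_versions comes from accelerate.utils.versions and the
# is_*_available helpers from accelerate.utils.imports, as elsewhere in the diff.
from accelerate.utils.imports import is_mlu_available, is_musa_available
from accelerate.utils.versions import compare_versions


def check_deepspeed_compatibility():
    if is_mlu_available():
        if compare_versions("deepspeed-mlu", "<", "0.10.1"):
            raise ImportError("DeepSpeed MLU version must be >= 0.10.1. Please update DeepSpeed MLU.")
    elif is_musa_available():
        # Flipped by this commit: MUSA now requires DeepSpeed >= 0.14.3 (was <= 0.14.3).
        if compare_versions("deepspeed", "<", "0.14.3"):
            raise ImportError("DeepSpeed MUSA version must be >= 0.14.3. Please update DeepSpeed.")
    elif compare_versions("deepspeed", "<", "0.9.3"):
        raise ImportError("DeepSpeed version must be >= 0.9.3. Please update DeepSpeed.")
```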
5 changes: 5 additions & 0 deletions src/accelerate/checkpointing.py
@@ -33,6 +33,7 @@
WEIGHTS_NAME,
get_pretty_name,
is_mlu_available,
is_musa_available,
is_torch_xla_available,
is_xpu_available,
load,
@@ -152,6 +153,8 @@ def save_accelerator_state(
states["torch_xpu_manual_seed"] = torch.xpu.get_rng_state_all()
if is_mlu_available():
states["torch_mlu_manual_seed"] = torch.mlu.get_rng_state_all()
if is_musa_available():
states["torch_musa_manual_seed"] = torch.musa.get_rng_state_all()
else:
states["torch_cuda_manual_seed"] = torch.cuda.get_rng_state_all()
if is_torch_xla_available():
@@ -275,6 +278,8 @@ def load_accelerator_state(
torch.xpu.set_rng_state_all(states["torch_xpu_manual_seed"])
if is_mlu_available():
torch.mlu.set_rng_state_all(states["torch_mlu_manual_seed"])
if is_musa_available():
torch.musa.set_rng_state_all(states["torch_musa_manual_seed"])
else:
torch.cuda.set_rng_state_all(states["torch_cuda_manual_seed"])
if is_torch_xla_available():
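The two checkpointing hunks give MUSA the same RNG round trip the other backends already get. A minimal sketch of that save/restore symmetry, assuming a torch build that exposes `torch.musa` with the `get_rng_state_all`/`set_rng_state_all` API the diff itself calls:

```python
# Minimal sketch of the MUSA RNG-state round trip added above (not the full
# save_accelerator_state/load_accelerator_state logic). Assumes a torch build
# with a torch.musa backend, as implied by the calls in the diff.
import torch
from accelerate.utils.imports import is_musa_available

states = {}
if is_musa_available():
    # Save: capture the RNG state of every MUSA device so a resumed run
    # reproduces the same random streams (dropout masks, shuffling, ...).
    states["torch_musa_manual_seed"] = torch.musa.get_rng_state_all()

# ... checkpoint is written, training restarts, checkpoint is read back ...

if is_musa_available() and "torch_musa_manual_seed" in states:
    # Load: push the saved per-device states back into the MUSA generators.
    torch.musa.set_rng_state_all(states["torch_musa_manual_seed"])
```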
2 changes: 2 additions & 0 deletions src/accelerate/commands/env.py
@@ -83,6 +83,8 @@ def env_command(args):
info["GPU type"] = torch.cuda.get_device_name()
if pt_mlu_available:
info["MLU type"] = torch.mlu.get_device_name()
if pt_musa_available:
info["MUSA type"] = torch.musa.get_device_name()
if pt_npu_available:
info["CANN version"] = torch.version.cann

3 changes: 3 additions & 0 deletions src/accelerate/hooks.py
@@ -28,6 +28,7 @@
)
from .utils.imports import (
is_mlu_available,
is_musa_available,
is_npu_available,
is_xpu_available,
)
@@ -391,6 +392,8 @@ def post_forward(self, module, output):
device = f"npu:{device}"
elif is_mlu_available():
device = f"mlu:{device}"
elif is_musa_available():
device = f"musa:{device}"
elif is_xpu_available():
device = f"xpu:{device}"
del self.tied_params_map[value_pointer][device]
3 changes: 3 additions & 0 deletions src/accelerate/utils/dataclasses.py
@@ -41,6 +41,7 @@
is_cuda_available,
is_mlu_available,
is_msamp_available,
is_musa_available,
is_npu_available,
is_transformer_engine_available,
is_xpu_available,
@@ -1686,6 +1687,8 @@ def __post_init__(self):
device = torch.npu.current_device()
elif is_mlu_available():
device = torch.mlu.current_device()
elif is_musa_available():
device = torch.musa.current_device()
elif is_cuda_available():
device = torch.cuda.current_device()
elif is_xpu_available():
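The dataclasses change slots MUSA into the existing backend priority when resolving the current device index. A hedged sketch of that fall-through, with the availability helpers taken from `accelerate.utils.imports` as in the diff and the CPU fallback being a placeholder for illustration:

```python
# Sketch of the backend priority used when picking a device index, now
# including MUSA. Illustrative only; the real __post_init__ does more.
import torch
from accelerate.utils.imports import (
    is_cuda_available,
    is_mlu_available,
    is_musa_available,
    is_npu_available,
    is_xpu_available,
)


def current_device_index():
    if is_npu_available():
        return torch.npu.current_device()
    if is_mlu_available():
        return torch.mlu.current_device()
    if is_musa_available():
        return torch.musa.current_device()
    if is_cuda_available():
        return torch.cuda.current_device()
    if is_xpu_available():
        return torch.xpu.current_device()
    return None  # placeholder CPU fallback for this sketch
```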
11 changes: 7 additions & 4 deletions tests/test_accelerator.py
@@ -30,8 +30,9 @@
from accelerate.state import GradientState, PartialState
from accelerate.test_utils import (
require_bnb,
require_cuda_or_xpu,
require_huggingface_suite,
require_multi_gpu,
require_multi_device,
require_non_cpu,
require_transformer_engine,
slow,
@@ -452,7 +453,7 @@ def test_is_accelerator_prepared(self):
getattr(valid_dl, "_is_accelerate_prepared", False) is True
), "Valid Dataloader is missing `_is_accelerator_prepared` or is set to `False`"

@require_cuda
@require_cuda_or_xpu
@slow
@require_bnb
def test_accelerator_bnb(self):
@@ -498,7 +499,7 @@ def test_accelerator_bnb_cpu_error(self):
@require_non_torch_xla
@slow
@require_bnb
@require_multi_gpu
@require_multi_device
def test_accelerator_bnb_multi_device(self):
"""Tests that the accelerator can be used with the BNB library."""
from transformers import AutoModelForCausalLM
@@ -507,6 +508,8 @@ def test_accelerator_bnb_multi_device(self):
PartialState._shared_state = {"distributed_type": DistributedType.MULTI_GPU}
elif torch_device == "npu":
PartialState._shared_state = {"distributed_type": DistributedType.MULTI_NPU}
elif torch_device == "xpu":
PartialState._shared_state = {"distributed_type": DistributedType.MULTI_XPU}
else:
raise ValueError(f"{torch_device} is not supported in test_accelerator_bnb_multi_device.")

@@ -534,7 +537,7 @@ def test_accelerator_bnb_multi_device(self):
@require_non_torch_xla
@slow
@require_bnb
@require_multi_gpu
@require_multi_device
def test_accelerator_bnb_multi_device_no_distributed(self):
"""Tests that the accelerator can be used with the BNB library."""
from transformers import AutoModelForCausalLM
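These test edits swap CUDA-only decorators (`require_cuda`, `require_multi_gpu`) for device-agnostic ones (`require_cuda_or_xpu`, `require_multi_device`). A hedged sketch of the resulting style; the `torch_device` import path and the tiny model are assumptions for illustration, not part of the actual test file:

```python
# Hedged sketch of the device-agnostic test style adopted here. The
# torch_device import path is an assumption, and TinyNet is a placeholder.
import torch
from accelerate import Accelerator
from accelerate.test_utils import require_cuda_or_xpu
from accelerate.test_utils.testing import torch_device  # assumed location


class TinyNet(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(4, 4)

    def forward(self, x):
        return self.linear(x)


@require_cuda_or_xpu
def test_prepare_moves_model_to_accelerator():
    # Runs on CUDA or XPU machines instead of being pinned to CUDA only.
    accelerator = Accelerator()
    model = accelerator.prepare(TinyNet())
    assert next(model.parameters()).device.type == torch.device(torch_device).type
```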
15 changes: 8 additions & 7 deletions tests/test_big_modeling.py
@@ -36,6 +36,7 @@
from accelerate.test_utils import (
require_bnb,
require_cuda,
require_cuda_or_xpu,
require_multi_device,
require_multi_gpu,
require_non_cpu,
@@ -877,7 +878,7 @@ def test_cpu_offload_with_hook(self):
@require_non_torch_xla
@slow
@require_bnb
@require_multi_gpu
@require_multi_device
def test_dispatch_model_bnb(self):
"""Tests that `dispatch_model` quantizes int8 layers"""
from huggingface_hub import hf_hub_download
@@ -906,7 +907,7 @@ def test_dispatch_model_bnb(self):
assert model.h[(-1)].self_attention.query_key_value.weight.dtype == torch.int8
assert model.h[(-1)].self_attention.query_key_value.weight.device.index == 1

@require_cuda
@require_cuda_or_xpu
@slow
@require_bnb
def test_dispatch_model_int8_simple(self):
@@ -946,7 +947,7 @@ def test_dispatch_model_int8_simple(self):
model = load_checkpoint_and_dispatch(
model,
checkpoint=model_path,
device_map={"": torch.device("cuda:0")},
device_map={"": torch_device},
)

assert model.h[0].self_attention.query_key_value.weight.dtype == torch.int8
@@ -963,13 +964,13 @@ def test_dispatch_model_int8_simple(self):
model = load_checkpoint_and_dispatch(
model,
checkpoint=model_path,
device_map={"": "cuda:0"},
device_map={"": torch_device},
)

assert model.h[0].self_attention.query_key_value.weight.dtype == torch.int8
assert model.h[0].self_attention.query_key_value.weight.device.index == 0

@require_cuda
@require_cuda_or_xpu
@slow
@require_bnb
def test_dipatch_model_fp4_simple(self):
@@ -1010,7 +1011,7 @@ def test_dipatch_model_fp4_simple(self):
model = load_checkpoint_and_dispatch(
model,
checkpoint=model_path,
device_map={"": torch.device("cuda:0")},
device_map={"": torch_device},
)

assert model.h[0].self_attention.query_key_value.weight.dtype == torch.uint8
@@ -1027,7 +1028,7 @@ def test_dipatch_model_fp4_simple(self):
model = load_checkpoint_and_dispatch(
model,
checkpoint=model_path,
device_map={"": "cuda:0"},
device_map={"": torch_device},
)

assert model.h[0].self_attention.query_key_value.weight.dtype == torch.uint8
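The big-modeling tests replace hard-coded `cuda:0` device maps with `torch_device`, so the same test drives whichever accelerator the machine has. A hedged sketch of that dispatch pattern; the checkpoint path, the tiny model, and the `torch_device` import path are placeholders/assumptions:

```python
# Hedged sketch of the device-agnostic dispatch pattern the tests move to:
# an empty-string key in device_map places the whole model on torch_device
# ("cuda", "xpu", ...). model_path and TinyNet are placeholders.
import torch
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
from accelerate.test_utils.testing import torch_device  # assumed location


class TinyNet(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(8, 8)

    def forward(self, x):
        return self.linear(x)


model_path = "path/to/saved/checkpoint"  # placeholder checkpoint location

with init_empty_weights():
    model = TinyNet()

model = load_checkpoint_and_dispatch(
    model,
    checkpoint=model_path,
    device_map={"": torch_device},  # was {"": torch.device("cuda:0")} before this change
)
```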
2 changes: 0 additions & 2 deletions tests/test_kwargs_handlers.py
@@ -27,7 +27,6 @@
path_in_accelerate_package,
require_multi_device,
require_non_cpu,
require_non_xpu,
)
from accelerate.test_utils.testing import slow
from accelerate.utils import AutocastKwargs, KwargsHandler, ProfileKwargs, TorchDynamoPlugin, clear_environment
@@ -50,7 +49,6 @@ def test_kwargs_handler(self):
assert MockClass(a=2, c=2.25).to_kwargs() == {"a": 2, "c": 2.25}

@require_non_cpu
@require_non_xpu
def test_grad_scaler_kwargs(self):
# If no defaults are changed, `to_kwargs` returns an empty dict.
scaler_handler = GradScalerKwargs(init_scale=1024, growth_factor=2)
3 changes: 1 addition & 2 deletions tests/test_modeling_utils.py
@@ -28,7 +28,6 @@
from accelerate import init_empty_weights
from accelerate.big_modeling import cpu_offload
from accelerate.test_utils import (
require_cuda,
require_huggingface_suite,
require_multi_device,
require_non_cpu,
@@ -853,7 +852,7 @@ def test_infer_auto_device_map_with_fallback_allocation_and_buffers(self):
expected_device_map = {"batchnorm": 0, "linear1": "disk", "linear2": "disk"}
assert device_map == expected_device_map

@require_cuda
@require_non_cpu
def test_get_balanced_memory(self):
model = ModelForTest()
# model has size 236: linear1 64, batchnorm 72, linear2 100
3 changes: 1 addition & 2 deletions tests/test_optimizer.py
@@ -19,7 +19,7 @@

from accelerate import Accelerator
from accelerate.state import AcceleratorState
from accelerate.test_utils import require_cpu, require_non_cpu, require_non_xpu
from accelerate.test_utils import require_cpu, require_non_cpu


@require_cpu
@@ -37,7 +37,6 @@ def test_accelerated_optimizer_pickling(self):


@require_non_cpu
@require_non_xpu
class OptimizerTester(unittest.TestCase):
def test_accelerated_optimizer_step_was_skipped(self):
model = torch.nn.Linear(5, 5)
