Showcase torch compile with hpu (#119)

Signed-off-by: Jerome <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Jirka Borovec <[email protected]>
3 people authored Dec 1, 2023
1 parent 7e077ad commit 99622f5
Showing 6 changed files with 272 additions and 7 deletions.
3 changes: 1 addition & 2 deletions .azure/hpu-tests.yml
@@ -112,12 +112,12 @@ jobs:
test_pytorch/test_dynamic_shapes.py \
test_pytorch/test_datamodule.py \
test_pytorch/test_profiler.py \
test_pytorch/test_compile.py \
--hpus 1 --junitxml=hpu_test-torch-results.xml
workingDirectory: tests/
displayName: 'HPU General tests'
- bash: |
set -ex
python -m pytest -sv test_pytorch/test_deepspeed.py \
--junitxml=hpu_deepspeed_test-results.xml
workingDirectory: tests/
@@ -133,7 +133,6 @@ jobs:
displayName: 'HPU precision test'
- bash: |
set -ex
bash run_standalone_tests.sh --hpus 2
workingDirectory: tests/
displayName: 'Multi card(2) HPU test'
2 changes: 1 addition & 1 deletion CHANGELOG.md
@@ -11,11 +11,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- Added support for Deepspeed inference on HPU with tests and documentation ([#110](https://github.com/Lightning-AI/lightning-Habana/pull/110))
- Added tests, examples, and documentation for dynamic shapes with recipe caching ([#107](https://github.com/Lightning-AI/lightning-Habana/pull/107))
- Added preview of torch compile with tests and documentation ([#119](https://github.com/Lightning-AI/lightning-Habana/pull/119))

### Changed

- Changed HPU docker image based on Synapse AI release 1.13.0 ([#114](https://github.com/Lightning-AI/lightning-Habana/pull/114))
- Changed Multicard tests to run standalone ([#118](https://github.com/Lightning-AI/lightning-Habana/pull/118))
-

### Fixed
16 changes: 16 additions & 0 deletions docs/source/advanced.rst
@@ -693,3 +693,19 @@ Limitations of HPU Graphs
* Using HPU Graphs with `torch.compile` is not supported.

Please refer to `Limitations of HPU Graphs <https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/HPU_Graphs_Training.html#limitations-of-hpu-graph-apis>`_


----

Using torch compile
------------------------

PyTorch Eager mode, and Eager mode with `torch.compile`, are available as an early preview.
Two new compile backends have been added to support HPU: `aot_hpu_training_backend` for training and `aot_hpu_inference_backend` for inference.

.. code-block:: python

    compiled_train_model = torch.compile(model_to_train, backend="aot_hpu_training_backend")
    compiled_eval_model = torch.compile(model_to_eval, backend="aot_hpu_inference_backend")

Please refer to the `GAUDI Release Notes <https://docs.habana.ai/en/latest/Release_Notes/GAUDI_Release_Notes.html>`_ for further details.
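
For context, a minimal end-to-end sketch of training a compiled model on HPU. This is a sketch only: it assumes a Gaudi environment with lazy mode disabled (e.g. ``PT_HPU_LAZY_MODE=0``), and `BoringModel` merely stands in for any `LightningModule`.

.. code-block:: python

    import torch
    from lightning.pytorch import Trainer
    from lightning.pytorch.demos.boring_classes import BoringModel

    from lightning_habana.pytorch.accelerator import HPUAccelerator
    from lightning_habana.pytorch.strategies import SingleHPUStrategy

    # Compile with the HPU training backend, then train as usual.
    model = BoringModel()
    compiled_model = torch.compile(model, backend="aot_hpu_training_backend")

    trainer = Trainer(
        accelerator=HPUAccelerator(),
        strategy=SingleHPUStrategy(),
        devices=1,
        fast_dev_run=True,
    )
    trainer.fit(compiled_model)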
8 changes: 8 additions & 0 deletions src/lightning_habana/pytorch/accelerator.py
@@ -33,6 +33,7 @@
from lightning_habana.utils.resources import _parse_hpus, device_count, get_device_stats

if _HABANA_FRAMEWORK_AVAILABLE:
import habana_frameworks.torch.core as htcore
import habana_frameworks.torch.hpu as torch_hpu


@@ -91,6 +92,13 @@ def get_device_name() -> str:
except (AttributeError, NameError):
return ""

@staticmethod
def is_lazy() -> bool:
"""Checks if lazy is enabled or not."""
if _HABANA_FRAMEWORK_AVAILABLE and htcore.is_lazy():
return True
return False

@classmethod
def register_accelerators(cls, accelerator_registry: Dict) -> None:
accelerator_registry.register(
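
The new `is_lazy()` helper lets callers guard `torch.compile` usage at runtime, since the HPU compile backends require eager mode. A minimal sketch of such a guard (the `maybe_compile` helper is hypothetical, mirroring the `_is_compile_allowed` fixture in the tests below):

import torch

from lightning_habana.pytorch.accelerator import HPUAccelerator


def maybe_compile(model):
    # Hypothetical guard: the HPU compile backends need lazy mode disabled,
    # so fall back to the uncompiled model when lazy mode is active.
    if HPUAccelerator.is_lazy():
        return model
    return torch.compile(model, backend="aot_hpu_training_backend")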
8 changes: 4 additions & 4 deletions tests/test_pytorch/test_accelerator.py
@@ -401,7 +401,7 @@ def test_hpu_parallel_reduce_op_strategy_default():


@pytest.mark.standalone()
@pytest.mark.skip(reason="fix ddp pytest issue")
@pytest.mark.skip(reason="TBD: make every parameterized tests standalone")
@pytest.mark.skipif(HPUAccelerator.auto_device_count() < 2, reason="Test requires multiple HPU devices")
@pytest.mark.parametrize(
("reduce_op", "expectation"),
@@ -453,7 +453,7 @@ def test_reduce_op_strategy(tmpdir, hpus, reduce_op, expectation):
default_root_dir=tmpdir,
accelerator=HPUAccelerator(),
devices=hpus,
strategy=MockHPUParallelStrategy(reduce_op=reduce_op, start_method="spawn"),
strategy=MockHPUParallelStrategy(reduce_op=reduce_op),
max_epochs=1,
fast_dev_run=3,
plugins=HPUPrecisionPlugin(precision="bf16-mixed"),
@@ -463,7 +463,7 @@ def test_reduce_op_strategy(tmpdir, hpus, reduce_op, expectation):


@pytest.mark.standalone()
@pytest.mark.skip(reason="fix ddp pytest issue")
@pytest.mark.skip(reason="TBD: make every parameterized tests standalone")
@pytest.mark.skipif(HPUAccelerator.auto_device_count() < 2, reason="Test requires multiple HPU devices")
@pytest.mark.parametrize(
("reduce_op", "logged_value_epoch", "logged_value_step"),
@@ -491,7 +491,7 @@ def test_reduce_op_logging(tmpdir, hpus, reduce_op, logged_value_epoch, logged_v
default_root_dir=tmpdir,
accelerator=HPUAccelerator(),
devices=hpus,
strategy=HPUParallelStrategy(start_method="spawn"),
strategy=HPUParallelStrategy(),
max_epochs=1,
fast_dev_run=3,
plugins=HPUPrecisionPlugin(precision="bf16-mixed"),
242 changes: 242 additions & 0 deletions tests/test_pytorch/test_compile.py
@@ -0,0 +1,242 @@
# Copyright The Lightning AI team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import pytest
import torch
import torch.nn as nn
import torch.nn.functional as func
from lightning_utilities import module_available

if module_available("lightning"):
from lightning.pytorch import LightningModule, Trainer
from lightning.pytorch.demos.boring_classes import BoringModel
from lightning.pytorch.demos.mnist_datamodule import MNISTDataModule
from lightning.pytorch.utilities.compile import from_compiled, to_uncompiled
elif module_available("pytorch_lightning"):
from pytorch_lightning import LightningModule, Trainer
from pytorch_lightning.demos.boring_classes import BoringModel
from pytorch_lightning.demos.mnist_datamodule import MNISTDataModule

from lightning_habana.pytorch.accelerator import HPUAccelerator
from lightning_habana.pytorch.plugins import HPUPrecisionPlugin
from lightning_habana.pytorch.strategies import HPUParallelStrategy, SingleHPUStrategy


@pytest.fixture()
def _is_compile_allowed():
if HPUAccelerator.is_lazy():
pytest.skip("Test requires lazy mode to be disabled")


@pytest.mark.usefixtures("_is_compile_allowed")
def test_compiler_context(tmp_path):
model = BoringModel()
compiled_model = torch.compile(model, backend="aot_hpu_training_backend")
assert model._compiler_ctx is compiled_model._compiler_ctx # shared reference


@pytest.mark.skipif(not module_available("lightning"), reason="Test requires lightning package")
@pytest.mark.usefixtures("_is_compile_allowed")
def test_lightning_compile_uncompile():
model = BoringModel()
compiled_model = torch.compile(model, backend="aot_hpu_training_backend")

def has_dynamo(fn):
return any(el for el in dir(fn) if el.startswith("_torchdynamo"))

from_compiled_model = from_compiled(compiled_model)
assert isinstance(from_compiled_model, LightningModule)
assert from_compiled_model._compiler_ctx is not None
assert has_dynamo(from_compiled_model.forward)
assert has_dynamo(from_compiled_model.training_step)
assert has_dynamo(from_compiled_model.validation_step)
assert has_dynamo(from_compiled_model.test_step)
assert has_dynamo(from_compiled_model.predict_step)

to_uncompiled_model = to_uncompiled(model)
assert to_uncompiled_model._compiler_ctx is None
assert to_uncompiled_model.forward == model.forward
assert to_uncompiled_model.training_step == model.training_step
assert to_uncompiled_model.validation_step == model.validation_step
assert to_uncompiled_model.test_step == model.test_step
assert to_uncompiled_model.predict_step == model.predict_step
assert not has_dynamo(to_uncompiled_model.forward)
assert not has_dynamo(to_uncompiled_model.training_step)
assert not has_dynamo(to_uncompiled_model.validation_step)
assert not has_dynamo(to_uncompiled_model.test_step)
assert not has_dynamo(to_uncompiled_model.predict_step)


@pytest.mark.usefixtures("_is_compile_allowed")
def test_compiled_model_to_log_metric(tmp_path):
class MyModel(BoringModel):
def training_step(self, batch, batch_idx):
loss = self.step(batch)
self.log("loss", loss)
return loss

model = MyModel()
compiled_model = torch.compile(model, backend="aot_hpu_training_backend")

_strategy = SingleHPUStrategy()

trainer = Trainer(
default_root_dir=tmp_path,
accelerator=HPUAccelerator(),
fast_dev_run=True,
strategy=_strategy,
devices=1,
enable_checkpointing=False,
enable_model_summary=False,
enable_progress_bar=False,
)
trainer.fit(compiled_model)

assert set(trainer.callback_metrics) == {"loss"}


class LitClassifier(LightningModule):
def __init__(self):
super().__init__()
self.l1 = torch.nn.Linear(28 * 28, 10)

def forward(self, x):
return torch.relu(self.l1(x.view(x.size(0), -1)))

def training_step(self, batch, batch_idx):
x, y = batch
loss = func.cross_entropy(self(x), y)
self.log("loss", loss)
return loss

def configure_optimizers(self):
return torch.optim.Adam(self.parameters(), lr=0.02)


@pytest.mark.usefixtures("_is_compile_allowed")
def test_compiled_model_with_datamodule_and_log_metric(tmp_path):
dm = MNISTDataModule(batch_size=32)
model = LitClassifier()
compiled_model = torch.compile(model, backend="aot_hpu_training_backend")
_strategy = SingleHPUStrategy()

trainer = Trainer(
default_root_dir=tmp_path,
accelerator=HPUAccelerator(),
fast_dev_run=True,
strategy=_strategy,
devices=1,
enable_checkpointing=False,
enable_model_summary=False,
enable_progress_bar=False,
)
trainer.fit(compiled_model, datamodule=dm)


@pytest.mark.usefixtures("_is_compile_allowed")
def test_trainer_fit_with_compiled_model(tmp_path):
"""Tests compiled BoringModel on HPU."""
model = BoringModel()
compiled_model = torch.compile(model, backend="aot_hpu_training_backend")

_strategy = SingleHPUStrategy()
_plugins = [HPUPrecisionPlugin(precision="bf16-mixed")]

trainer = Trainer(
default_root_dir=tmp_path,
accelerator=HPUAccelerator(),
strategy=_strategy,
plugins=_plugins,
devices=1,
fast_dev_run=True,
)
trainer.fit(compiled_model)


class Net(nn.Module):
def __init__(self):
super().__init__()
self.fc1 = nn.Linear(1, 1)

def forward(self, x):
x = torch.flatten(x, 1)
x = self.fc1(x)
return func.log_softmax(x, dim=1)


@pytest.mark.usefixtures("_is_compile_allowed")
def test_trainer_with_nn_module(tmp_path):
device = torch.device("hpu")
model = Net().to(device)
torch.compile(model, backend="aot_hpu_training_backend")


@pytest.mark.parametrize("hpus", [1])
@pytest.mark.usefixtures("_is_compile_allowed")
def test_all_stages_with_compile(tmpdir, hpus):
"""Tests all the model stages using BoringModel on HPU."""
model_to_train = BoringModel()
model_to_eval = BoringModel()
compiled_train_model = torch.compile(model_to_train, backend="aot_hpu_training_backend")
compiled_eval_model = torch.compile(model_to_eval, backend="aot_hpu_inference_backend")

_strategy = SingleHPUStrategy()
_plugins = [HPUPrecisionPlugin(precision="bf16-mixed")]
trainer = Trainer(
default_root_dir=tmpdir,
fast_dev_run=True,
accelerator=HPUAccelerator(),
strategy=_strategy,
devices=hpus,
plugins=_plugins,
)
trainer.fit(compiled_train_model)
trainer.validate(compiled_eval_model)
trainer.test(compiled_eval_model)
trainer.predict(compiled_eval_model)


@pytest.mark.standalone()
@pytest.mark.skipif(HPUAccelerator.auto_device_count() <= 1, reason="Test requires multiple HPU devices")
@pytest.mark.usefixtures("_is_compile_allowed")
@pytest.mark.parametrize("hpus", [2])
def test_parallel_strategy_with_compile(tmp_path, hpus):
"""Tests compiled BoringModel on HPU."""
model = BoringModel()
compiled_model = torch.compile(model, backend="aot_hpu_training_backend")

_plugins = [HPUPrecisionPlugin(precision="bf16-mixed")]
parallel_hpus = [torch.device("hpu")] * hpus
_strategy = HPUParallelStrategy(
parallel_devices=parallel_hpus,
bucket_cap_mb=100,
gradient_as_bucket_view=True,
static_graph=True,
find_unused_parameters=True,
)

trainer = Trainer(
default_root_dir=tmp_path,
accelerator=HPUAccelerator(),
strategy=_strategy,
plugins=_plugins,
devices=hpus,
fast_dev_run=True,
)
trainer.fit(compiled_model)
assert _strategy._ddp_kwargs["bucket_cap_mb"] == 100
assert _strategy._ddp_kwargs["gradient_as_bucket_view"] is True
assert _strategy._ddp_kwargs["static_graph"] is True
assert _strategy._ddp_kwargs["find_unused_parameters"] is True
