This repository has been archived by the owner on Nov 3, 2023. It is now read-only.

support pytorch lightning 1.7 #196

Open. Wants to merge 38 commits into base: main.

Changes from 31 commits

38 commits
d9e6ea7  update lightning  (JiahaoYao, Aug 11, 2022)
5992c73  change gpu to cuda  (JiahaoYao, Aug 12, 2022)
63bb622  accelerator is successfully removed  (JiahaoYao, Aug 12, 2022)
61a3775  remove the accelerator  (JiahaoYao, Aug 12, 2022)
db2f6ca  progress bar  (JiahaoYao, Aug 12, 2022)
282b993  adding the transform  (JiahaoYao, Aug 12, 2022)
fad6be9  change the max step to be -1  (JiahaoYao, Aug 12, 2022)
868688d  fix ci  (JiahaoYao, Aug 12, 2022)
612d08f  checkpoint  (JiahaoYao, Aug 12, 2022)
302f047  checkpoint_callback  (JiahaoYao, Aug 12, 2022)
5ba8227  adding the version  (JiahaoYao, Aug 13, 2022)
878c4b9  fix 'MNISTDataModule' object has no attribute 'train_transforms' issue  (JiahaoYao, Aug 15, 2022)
7eb7c83  fix the issue  (JiahaoYao, Aug 15, 2022)
d063b41  nit  (JiahaoYao, Aug 15, 2022)
bb5f8ff  remove progress bar  (JiahaoYao, Aug 15, 2022)
966d819  remove accelerator  (JiahaoYao, Aug 15, 2022)
b3bc93d  adding remote back  (JiahaoYao, Aug 15, 2022)
c3f5ce8  update bolts  (JiahaoYao, Aug 15, 2022)
e6e3817  kit start  (JiahaoYao, Aug 16, 2022)
d15f24c  split the testing  (JiahaoYao, Aug 18, 2022)
8bfdb20  Merge remote-tracking branch 'upstream/main' into rlt_1.7_0811  (JiahaoYao, Aug 18, 2022)
6bb1acd  put it back  (JiahaoYao, Aug 18, 2022)
d868524  fix the ci here  (JiahaoYao, Aug 18, 2022)
f7ed645  test memory  (JiahaoYao, Aug 18, 2022)
1c63f48  update the cpu number to 6  (JiahaoYao, Aug 18, 2022)
55e514e  adding the pip list  (JiahaoYao, Aug 19, 2022)
06b84c9  adding this  (JiahaoYao, Aug 19, 2022)
c607f0d  adding the debug  (JiahaoYao, Aug 19, 2022)
645fed2  Merge remote-tracking branch 'upstream/main' into rlt_1.7_0811  (JiahaoYao, Sep 29, 2022)
81fb6a4  update the lightning version  (JiahaoYao, Sep 29, 2022)
ecd9fac  rerun the ci test  (JiahaoYao, Sep 29, 2022)
98ea680  adding the signature pack  (JiahaoYao, Oct 3, 2022)
fc630a6  new line  (JiahaoYao, Oct 3, 2022)
b0d4cd1  import lib breaks due to https://stackoverflow.com/questions/7392956…  (JiahaoYao, Oct 3, 2022)
665e8fd  fix the lint  (JiahaoYao, Oct 3, 2022)
3868cff  switch  (JiahaoYao, Oct 3, 2022)
a277600  import failure  (JiahaoYao, Oct 3, 2022)
c08a8f5  nit  (JiahaoYao, Oct 3, 2022)
2 changes: 1 addition & 1 deletion .github/workflows/test.yaml
@@ -223,4 +223,4 @@ jobs:
run: |
pushd ray_lightning/tests
python -m pytest -v --durations=0 -x test_ddp.py
-python -m pytest -v --durations=0 -x test_horovod.py
+python -m pytest -v --durations=0 -x test_horovod.py
2 changes: 1 addition & 1 deletion README.md
@@ -93,7 +93,7 @@ ray.init("ray://<head_node_host>:10001")
```
Now you can run your training script on the laptop, but have it execute as if your laptop has all the resources of the cluster essentially providing you with an **infinite laptop**.

-**Note:** When using with Ray Client, you must disable checkpointing and logging for your Trainer by setting `checkpoint_callback` and `logger` to `False`.
+**Note:** When using with Ray Client, you must disable checkpointing and logging for your Trainer by setting `enable_checkpointing` and `logger` to `False`.

## Horovod Strategy on Ray
Or if you prefer to use Horovod as the distributed training protocol, use the `HorovodRayStrategy` instead.
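This README edit follows PyTorch Lightning 1.7, where the `Trainer(checkpoint_callback=...)` argument became `enable_checkpointing`. A minimal sketch of the Ray Client setup the note describes, assuming ray_lightning's `RayStrategy` and a reachable cluster; the address placeholder and worker count are illustrative only, not part of this PR:

```python
import ray
import pytorch_lightning as pl
from ray_lightning import RayStrategy

# Connect from a laptop to a remote cluster via Ray Client (address is a placeholder).
ray.init("ray://<head_node_host>:10001")

trainer = pl.Trainer(
    max_epochs=1,
    enable_checkpointing=False,  # PL >= 1.7 name; was `checkpoint_callback` in <= 1.6
    logger=False,                # logging must also be off when driving through Ray Client
    strategy=RayStrategy(num_workers=2, use_gpu=False),
)
```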
4 changes: 2 additions & 2 deletions ray_lightning/accelerators/delayed_gpu_accelerator.py
@@ -16,10 +16,10 @@
import torch

from pytorch_lightning.accelerators import Accelerator,\
-GPUAccelerator
+CUDAAccelerator


-class _GPUAccelerator(GPUAccelerator):
+class _GPUAccelerator(CUDAAccelerator):
"""Accelerator for GPU devices.

adapted from:
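PL 1.7 renamed `GPUAccelerator` to `CUDAAccelerator`, which is why the base class changes here. The PR switches the import outright; if a single code path had to keep supporting both 1.6.x and 1.7.x, a guarded import is one possible bridge (a sketch only, not what this diff does; the alias name `_BaseGPUAccelerator` is made up for illustration):

```python
try:
    # PyTorch Lightning >= 1.7
    from pytorch_lightning.accelerators import CUDAAccelerator as _BaseGPUAccelerator
except ImportError:
    # PyTorch Lightning <= 1.6
    from pytorch_lightning.accelerators import GPUAccelerator as _BaseGPUAccelerator


class _GPUAccelerator(_BaseGPUAccelerator):
    """Accelerator for GPU devices, regardless of which base class name is available."""
```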
4 changes: 3 additions & 1 deletion ray_lightning/examples/ray_ddp_sharded_example.py
@@ -57,6 +57,8 @@ def download_data():
num_workers=num_workers, use_gpu=use_gpu, init_hook=download_data)

dm = MNISTDataModule(data_dir, batch_size=batch_size)
+dm.train_transforms = None
+dm.val_transforms = None

model = ImageGPT(
embed_dim=embed_dim, layers=16, heads=4, vocab_size=32, num_pixels=28)
@@ -130,4 +132,4 @@ def download_data():
batch_size=args.batch_size,
embed_dim=args.embed_dim,
max_epochs=args.num_epochs,
-max_steps=None)
+max_steps=-1)
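Two PL 1.7 behaviors drive this example change: `Trainer(max_steps=None)` is rejected (`-1` is now the sentinel for "no step limit"), and the bolts `MNISTDataModule` no longer gets `train_transforms`/`val_transforms` attributes after PL 1.7 removed them from `LightningDataModule`, so the examples and tests assign them explicitly before fitting. A condensed sketch of that workaround under those assumptions (path and sizes are placeholders):

```python
import pytorch_lightning as pl
from pl_bolts.datamodules import MNISTDataModule

dm = MNISTDataModule(data_dir="./data", batch_size=32)
# bolts 0.4 / PL 1.7: set the attributes explicitly so downstream code that
# reads dm.train_transforms does not raise AttributeError.
dm.train_transforms = None
dm.val_transforms = None

trainer = pl.Trainer(
    max_epochs=10,
    max_steps=-1,  # PL 1.7: -1 (not None) disables the step limit
)
```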
4 changes: 3 additions & 1 deletion ray_lightning/examples/ray_ddp_tune.py
@@ -31,11 +31,13 @@ def download_data():
trainer = pl.Trainer(
max_epochs=num_epochs,
callbacks=callbacks,
-progress_bar_refresh_rate=0,
+enable_progress_bar=False,
strategy=RayStrategy(
num_workers=num_workers, use_gpu=use_gpu, init_hook=download_data))
dm = MNISTDataModule(
data_dir=data_dir, num_workers=1, batch_size=config["batch_size"])
+dm.train_transforms = None
+dm.val_transforms = None
trainer.fit(model, dm)


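`progress_bar_refresh_rate` was removed from the `Trainer` signature in PL 1.7; disabling the bar now goes through `enable_progress_bar=False`, mirroring the `enable_checkpointing` rename above. A minimal sketch of the equivalent call, with illustrative `RayStrategy` arguments:

```python
import pytorch_lightning as pl
from ray_lightning import RayStrategy

trainer = pl.Trainer(
    max_epochs=4,
    enable_progress_bar=False,  # PL >= 1.7; replaces progress_bar_refresh_rate=0
    strategy=RayStrategy(num_workers=4, use_gpu=False),
)
```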
2 changes: 1 addition & 1 deletion ray_lightning/launchers/ray_horovod_launcher.py
@@ -165,7 +165,6 @@ def _wrapping_function(
`_wrapping_function` is run on each remote worker.
`function(*args, **kwargs)` is where the actual training happens.
"""

self._strategy.set_remote(True)

# `function` is a trainer's instance method
@@ -193,6 +192,7 @@ def _wrapping_function(
rank_zero_only.rank = self.global_rank
set_cuda_device_if_used(trainer.strategy)

+trainer.strategy.set_remote(True)
# Move the model to the appropriate device.
trainer.strategy.model_to_device()

1 change: 1 addition & 0 deletions ray_lightning/launchers/ray_launcher.py
@@ -293,6 +293,7 @@ def _wrapping_function(
init_session(rank=global_rank, queue=tune_queue)

self._strategy._worker_setup(process_idx=global_rank)
+trainer.strategy.set_remote(True)
trainer.strategy.root_device = self._strategy.root_device
trainer.strategy.global_rank = self._strategy.global_rank
trainer.strategy.local_rank = self._strategy.local_rank
3 changes: 2 additions & 1 deletion ray_lightning/ray_ddp.py
@@ -336,5 +336,6 @@ def teardown(self) -> None:
This function is overriding ddp_spawn_strategy's method.
It is run on the driver processes.
"""
-self.accelerator = None
super().teardown()
+if not self._is_remote:
+    self.accelerator = None
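The new guard pairs with the `trainer.strategy.set_remote(True)` calls added in the launchers above: workers are flagged as remote before training starts, so only the driver clears its accelerator reference during teardown. A condensed, hypothetical illustration of that interaction (the class name and defaults are invented for the sketch; the real strategies live in ray_ddp.py and ray_horovod.py):

```python
class _RemoteAwareStrategySketch:
    """Toy stand-in showing how set_remote() and teardown() cooperate."""

    def __init__(self) -> None:
        self._is_remote = False  # driver-side default; workers flip it to True
        self.accelerator = "accelerator-handle"

    def set_remote(self, remote: bool) -> None:
        # Called by the launcher's _wrapping_function on each remote worker.
        self._is_remote = remote

    def teardown(self) -> None:
        # Workers keep their accelerator so later hooks can still use it;
        # only the driver releases the reference.
        if not self._is_remote:
            self.accelerator = None
```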
3 changes: 2 additions & 1 deletion ray_lightning/ray_horovod.py
@@ -147,8 +147,9 @@ def teardown(self) -> None:
It is run on the driver process.
"""
self.join()
-self.accelerator = None
super().teardown()
+if not self._is_remote:
+    self.accelerator = None

@property
def is_distributed(self):
7 changes: 5 additions & 2 deletions ray_lightning/tests/test_ddp.py
@@ -260,6 +260,8 @@ def test_predict(tmpdir, ray_start_2_cpus, seed, num_workers):
model = LightningMNISTClassifier(config, tmpdir)
dm = MNISTDataModule(
data_dir=tmpdir, num_workers=1, batch_size=config["batch_size"])
+dm.train_transforms = None
+dm.val_transforms = None
strategy = RayStrategy(num_workers=num_workers, use_gpu=False)
trainer = get_trainer(
tmpdir, limit_train_batches=20, max_epochs=1, strategy=strategy)
@@ -280,6 +282,8 @@ def test_predict_client(tmpdir, start_ray_client_server_2_cpus, seed,
model = LightningMNISTClassifier(config, tmpdir)
dm = MNISTDataModule(
data_dir=tmpdir, num_workers=1, batch_size=config["batch_size"])
+dm.train_transforms = None
+dm.val_transforms = None
strategy = RayStrategy(num_workers=num_workers, use_gpu=False)
trainer = get_trainer(
tmpdir, limit_train_batches=20, max_epochs=1, strategy=strategy)
@@ -300,8 +304,7 @@ def test_early_stop(tmpdir, ray_start_2_cpus):
callbacks=[early_stop],
num_sanity_val_steps=0,
limit_train_batches=1.0,
-limit_val_batches=1.0,
-progress_bar_refresh_rate=1)
+limit_val_batches=1.0)
trainer.fit(model)
trained_model = BoringModel.load_from_checkpoint(
trainer.checkpoint_callback.best_model_path)
2 changes: 2 additions & 0 deletions ray_lightning/tests/test_ddp_gpu.py
@@ -57,6 +57,8 @@ def test_predict(tmpdir, ray_start_2_gpus, seed, num_workers):
model = LightningMNISTClassifier(config, tmpdir)
dm = MNISTDataModule(
data_dir=tmpdir, num_workers=1, batch_size=config["batch_size"])
+dm.train_transforms = None
+dm.val_transforms = None
strategy = RayStrategy(num_workers=num_workers, use_gpu=True)
trainer = get_trainer(
tmpdir, limit_train_batches=20, max_epochs=1, strategy=strategy)
6 changes: 6 additions & 0 deletions ray_lightning/tests/test_horovod.py
@@ -86,6 +86,8 @@ def test_predict(tmpdir, ray_start_2_cpus, seed, num_workers):
model = LightningMNISTClassifier(config, tmpdir)
dm = MNISTDataModule(
data_dir=tmpdir, num_workers=1, batch_size=config["batch_size"])
+dm.train_transforms = None
+dm.val_transforms = None
strategy = HorovodRayStrategy(num_workers=num_workers, use_gpu=False)
trainer = get_trainer(
tmpdir, limit_train_batches=20, max_epochs=1, strategy=strategy)
@@ -105,6 +107,8 @@ def test_predict_client(tmpdir, start_ray_client_server_2_cpus, seed,
model = LightningMNISTClassifier(config, tmpdir)
dm = MNISTDataModule(
data_dir=tmpdir, num_workers=1, batch_size=config["batch_size"])
+dm.train_transforms = None
+dm.val_transforms = None
strategy = HorovodRayStrategy(num_workers=num_workers, use_gpu=False)
trainer = get_trainer(
tmpdir, limit_train_batches=20, max_epochs=1, strategy=strategy)
@@ -147,6 +151,8 @@ def test_predict_gpu(tmpdir, ray_start_2_gpus, seed, num_workers):
model = LightningMNISTClassifier(config, tmpdir)
dm = MNISTDataModule(
data_dir=tmpdir, num_workers=1, batch_size=config["batch_size"])
+dm.train_transforms = None
+dm.val_transforms = None
strategy = HorovodRayStrategy(num_workers=num_workers, use_gpu=True)
trainer = get_trainer(
tmpdir, limit_train_batches=20, max_epochs=1, strategy=strategy)
4 changes: 2 additions & 2 deletions ray_lightning/tests/test_tune.py
@@ -12,7 +12,7 @@

@pytest.fixture
def ray_start_4_cpus():
-address_info = ray.init(num_cpus=4)
+address_info = ray.init(num_cpus=6)
yield address_info
ray.shutdown()

@@ -31,7 +31,7 @@ def _inner_train(config):
dir,
callbacks=callbacks,
strategy=strategy,
-checkpoint_callback=False,
+enable_checkpointing=False,
**config)
trainer.fit(model)

7 changes: 5 additions & 2 deletions ray_lightning/tests/utils.py
@@ -216,7 +216,7 @@ def get_trainer(dir,
limit_train_batches: int = 10,
limit_val_batches: int = 10,
callbacks: Optional[List[Callback]] = None,
-checkpoint_callback: bool = True,
+enable_checkpointing: bool = True,
**trainer_kwargs) -> Trainer:
"""Returns a Pytorch Lightning Trainer with the provided arguments."""
callbacks = [] if not callbacks else callbacks
@@ -228,7 +228,7 @@
limit_train_batches=limit_train_batches,
limit_val_batches=limit_val_batches,
enable_progress_bar=False,
-checkpoint_callback=checkpoint_callback,
+enable_checkpointing=enable_checkpointing,
**trainer_kwargs)
return trainer

@@ -256,8 +256,11 @@ def load_test(trainer: Trainer, model: LightningModule):
def predict_test(trainer: Trainer, model: LightningModule,
dm: LightningDataModule):
"""Checks if the trained model has high accuracy on the test set."""
+dm.train_transforms = None
+dm.val_transforms = None
trainer.fit(model, datamodule=dm)
model = trainer.lightning_module
+dm.test_transforms = None
dm.setup(stage="test")
test_loader = dm.test_dataloader()
acc = torchmetrics.Accuracy()
6 changes: 3 additions & 3 deletions requirements-test.txt
@@ -4,10 +4,10 @@ flake8-comprehensions
flake8-quotes
yapf==0.23.0
pytest
-pytorch-lightning==1.6.4
-lightning-bolts==0.3.3
+pytorch-lightning==1.7.7
+lightning-bolts==0.4.0
ray[tune]
-torch==1.12.0
+torch==1.12.1
torchmetrics
torchvision
protobuf<=3.20.1
4 changes: 2 additions & 2 deletions setup.py
@@ -3,10 +3,10 @@
setup(
name="ray_lightning",
packages=find_packages(where=".", include="ray_lightning*"),
version="0.3.0",
version="0.4.0",
author="Ray Team",
description="Ray distributed strategies for Pytorch Lightning.",
long_description="Custom Pytorch Lightning distributed strategies "
"built on top of distributed computing framework Ray.",
url="https://github.com/ray-project/ray_lightning_accelerators",
install_requires=["pytorch-lightning==1.6.*", "ray"])
install_requires=["pytorch_lightning>=1.6.4,<=1.7.7", "ray"])