Bug description
The code can be accessed at https://www.kaggle.com/code/vigneshwar472/notebook5a03168e34
I am working on a multiclass classification task and want to train a neural network with PyTorch Lightning on 2x T4 GPUs in a Kaggle notebook. Everything seems to work fine, but I encounter this error when I fit the trainer.
Training step of the LightningModule

```python
def training_step(self, batch, batch_idx):
    x, y = batch
    logits = self(x)
    loss = F.cross_entropy(logits, y)
    preds = F.softmax(logits, dim=1)
    preds = preds.to(y.device)  # `.to()` is not in-place, so assign the result
    self.log_dict({
        "train_Loss": loss,
        "train_Accuracy": self.accuracy(preds, y),
        "train_Precision": self.precision(preds, y),
        "train_Recall": self.recall(preds, y),
        "train_F1-Score": self.f1(preds, y),
        "train_F3-Score": self.f_beta(preds, y),
        "train_AUROC": self.auroc(preds, y),
    }, on_step=True, on_epoch=True, prog_bar=True, sync_dist=True)
    return loss
```
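The metric attributes above are torchmetrics objects created in `__init__`. The exact arguments are in the notebook; a minimal sketch of that setup (the class name and `num_classes` value here are placeholders) looks like this:

```python
import lightning as L
import torchmetrics

class Classifier(L.LightningModule):
    def __init__(self, num_classes: int = 10):  # placeholder value
        super().__init__()
        task = "multiclass"
        # one torchmetrics object per logged metric in training_step
        self.accuracy = torchmetrics.Accuracy(task=task, num_classes=num_classes)
        self.precision = torchmetrics.Precision(task=task, num_classes=num_classes)
        self.recall = torchmetrics.Recall(task=task, num_classes=num_classes)
        self.f1 = torchmetrics.F1Score(task=task, num_classes=num_classes)
        self.f_beta = torchmetrics.FBetaScore(task=task, num_classes=num_classes, beta=3.0)
        self.auroc = torchmetrics.AUROC(task=task, num_classes=num_classes)
```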
Initializing the trainer

```python
trainer = L.Trainer(
    max_epochs=5,
    devices=2,
    strategy="ddp_notebook",
    num_sanity_val_steps=0,
    profiler="simple",
    default_root_dir="/kaggle/working",
    callbacks=[
        DeviceStatsMonitor(),
        StochasticWeightAveraging(swa_lrs=1e-2),
        # EarlyStopping(monitor='train_Loss', min_delta=0.001, patience=100, verbose=False, mode='min'),
    ],
    enable_progress_bar=True,
    enable_model_summary=True,
)
trainer.fit(model, data_mod)  # data_mod is a LightningDataModule
```
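For completeness, `data_mod` is a `LightningDataModule`. The real datasets live in the notebook; it follows this general shape (a minimal sketch, with placeholder names and batch size):

```python
import lightning as L
from torch.utils.data import DataLoader, Dataset

class MyDataModule(L.LightningDataModule):
    """Sketch only; the actual Dataset objects are built in the notebook."""

    def __init__(self, train_ds: Dataset, val_ds: Dataset, batch_size: int = 64):
        super().__init__()
        self.train_ds, self.val_ds, self.batch_size = train_ds, val_ds, batch_size

    def train_dataloader(self):
        return DataLoader(self.train_ds, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.val_ds, batch_size=self.batch_size)
```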
Output and traceback:

```
W1116 14:03:37.546000 140135548491584 torch/multiprocessing/spawn.py:146] Terminating process 131 via signal SIGTERM
INFO: [rank: 0] Received SIGTERM: 15
---------------------------------------------------------------------------
ProcessRaisedException Traceback (most recent call last)
Cell In[14], line 1
----> 1 trainer.fit(model, data_mod)
File /opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py:538, in Trainer.fit(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
536 self.state.status = TrainerStatus.RUNNING
537 self.training = True
--> 538 call._call_and_handle_interrupt(
539 self, self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
540 )
File /opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/call.py:46, in _call_and_handle_interrupt(trainer, trainer_fn, *args, **kwargs)
44 try:
45 if trainer.strategy.launcher is not None:
---> 46 return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
47 return trainer_fn(*args, **kwargs)
49 except _TunerExitException:
File /opt/conda/lib/python3.10/site-packages/lightning/pytorch/strategies/launchers/multiprocessing.py:144, in _MultiProcessingLauncher.launch(self, function, trainer, *args, **kwargs)
136 process_context = mp.start_processes(
137 self._wrapping_function,
138 args=process_args,
(...)
141 join=False, # we will join ourselves to get the process references
142 )
143 self.procs = process_context.processes
--> 144 while not process_context.join():
145 pass
147 worker_output = return_queue.get()
File /opt/conda/lib/python3.10/site-packages/torch/multiprocessing/spawn.py:189, in ProcessContext.join(self, timeout)
187 msg = "\n\n-- Process %d terminated with the following error:\n" % error_index
188 msg += original_trace
--> 189 raise ProcessRaisedException(msg, error_index, failed_process.pid)
ProcessRaisedException:
-- Process 1 terminated with the following error:
Traceback (most recent call last):
File "/opt/conda/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 76, in _wrap
fn(i, *args)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/strategies/launchers/multiprocessing.py", line 173, in _wrapping_function
results = function(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 574, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 981, in _run
results = self._run_stage()
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 1025, in _run_stage
self.fit_loop.run()
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/loops/fit_loop.py", line 205, in run
self.advance()
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/loops/fit_loop.py", line 363, in advance
self.epoch_loop.run(self._data_fetcher)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/loops/training_epoch_loop.py", line 140, in run
self.advance(data_fetcher)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/loops/training_epoch_loop.py", line 250, in advance
batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 190, in run
self._optimizer_step(batch_idx, closure)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 268, in _optimizer_step
call._call_lightning_module_hook(
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/call.py", line 167, in _call_lightning_module_hook
output = fn(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/core/module.py", line 1306, in optimizer_step
optimizer.step(closure=optimizer_closure)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/core/optimizer.py", line 153, in step
step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/strategies/ddp.py", line 270, in optimizer_step
optimizer_output = super().optimizer_step(optimizer, closure, model, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/strategies/strategy.py", line 238, in optimizer_step
return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/plugins/precision/precision.py", line 122, in optimizer_step
return optimizer.step(closure=closure, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/optim/optimizer.py", line 484, in wrapper
out = func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/optim/optimizer.py", line 89, in _use_grad
ret = func(self, *args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/optim/adamw.py", line 204, in step
loss = closure()
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/plugins/precision/precision.py", line 108, in _wrap_closure
closure_result = closure()
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 144, in __call__
self._result = self.closure(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 129, in closure
step_output = self._step_fn()
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 317, in _training_step
training_step_output = call._call_strategy_hook(trainer, "training_step", *kwargs.values())
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/call.py", line 319, in _call_strategy_hook
output = fn(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/strategies/strategy.py", line 389, in training_step
return self._forward_redirection(self.model, self.lightning_module, "training_step", *args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/strategies/strategy.py", line 640, in __call__
wrapper_output = wrapper_module(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1636, in forward
else self._run_ddp_forward(*inputs, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1454, in _run_ddp_forward
return self.module(*inputs, **kwargs) # type: ignore[index]
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/strategies/strategy.py", line 633, in wrapped_forward
out = method(*_args, **_kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 433, in _fn
return fn(*args, **kwargs)
File "/tmp/ipykernel_30/3650372019.py", line 74, in training_step
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, sync_dist=True)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/core/module.py", line 437, in log
apply_to_collection(value, dict, self.__check_not_nested, name)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/core/module.py", line 438, in torch_dynamo_resume_in_log_at_437
apply_to_collection(
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/core/module.py", line 484, in torch_dynamo_resume_in_log_at_438
results.reset(metrics=False, fx=self._current_fx_name)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/core/module.py", line 508, in torch_dynamo_resume_in_log_at_484
and is_param_in_hook_signature(self.training_step, "dataloader_iter", explicit=True)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/core/module.py", line 525, in torch_dynamo_resume_in_log_at_508
results.log(
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py", line 403, in log
metric = _ResultMetric(meta, isinstance(value, Tensor))
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py", line 404, in torch_dynamo_resume_in_log_at_403
self[key] = metric
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py", line 411, in torch_dynamo_resume_in_log_at_404
self[key].to(value.device)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py", line 414, in torch_dynamo_resume_in_log_at_411
self.update_metrics(key, value, batch_size)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py", line 419, in update_metrics
result_metric.forward(value, batch_size)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py", line 270, in forward
self.update(value, batch_size)
File "/opt/conda/lib/python3.10/site-packages/torchmetrics/metric.py", line 483, in wrapped_func
update(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py", line 225, in update
self._forward_cache = self.meta.sync(value.clone()) # `clone` because `sync` is in-place
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py", line 144, in sync
assert self._sync is not None
AssertionError
```
Please help me resolve this error; I am very confused about what to do.
What version are you seeing the problem on?
v2.4
How to reproduce the bug
Check out the Kaggle notebook: https://www.kaggle.com/code/vigneshwar472/notebook5a03168e34
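If the notebook is inconvenient, here is a minimal toy cell that follows the same call pattern (logging with `sync_dist=True` under `ddp_notebook`). It is a sketch with random data that I have not verified reproduces the crash; the model, data shapes, and `NUM_CLASSES` are placeholders. The `torch._dynamo` frames in the traceback suggest the module may also run under `torch.compile`, so a commented line is included for that.

```python
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import lightning as L

NUM_CLASSES = 10  # placeholder

class ToyModel(L.LightningModule):
    def __init__(self):
        super().__init__()
        self.net = torch.nn.Linear(32, NUM_CLASSES)

    def forward(self, x):
        return self.net(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        loss = F.cross_entropy(self(x), y)
        # same logging flags as in the real module
        self.log("train_Loss", loss, on_step=True, on_epoch=True,
                 prog_bar=True, sync_dist=True)
        return loss

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=1e-3)

# run inside a notebook cell: ddp_notebook requires an interactive environment
ds = TensorDataset(torch.randn(512, 32),
                   torch.randint(0, NUM_CLASSES, (512,)))
model = ToyModel()
# model = torch.compile(model)  # uncomment to mirror the dynamo frames
trainer = L.Trainer(max_epochs=1, devices=2, strategy="ddp_notebook")
trainer.fit(model, DataLoader(ds, batch_size=64))
```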
Error messages and logs
See the full traceback in the bug description above.
Environment
Current environment
#- PyTorch Lightning Version (e.g., 2.4.0): 2.4
#- PyTorch Version (e.g., 2.4):
#- Python version (e.g., 3.12): 3.10
#- OS (e.g., Linux): Linux (Kaggle)
#- CUDA/cuDNN version:
#- GPU models and configuration: 2x NVIDIA T4
#- How you installed Lightning (`conda`, `pip`, source):
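The remaining fields can be read off by running this snippet in the same Kaggle notebook (standard version attributes only):

```python
import platform
import torch
import lightning as L

print("Lightning:", L.__version__)
print("PyTorch:  ", torch.__version__)
print("Python:   ", platform.python_version())
print("CUDA:     ", torch.version.cuda, "| cuDNN:", torch.backends.cudnn.version())
print("GPUs:     ", [torch.cuda.get_device_name(i)
                     for i in range(torch.cuda.device_count())])
```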
More info
No response