Bug description

The code can be accessed at https://www.kaggle.com/code/vigneshwar472/notebook5a03168e34

I am working on a multiclass classification task and want to train a neural network with PyTorch Lightning on 2x T4 GPUs in a Kaggle notebook. Everything seems to work fine until I fit the trainer, which fails with the error below.
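For context, here is a minimal, self-contained sketch of the training step and trainer setup, reconstructed from the traceback below. Only the self.log(...) call is verbatim (the trace shows it at line 74 of the notebook, and the assertion fires inside it); the architecture, data, optimizer settings, and the strategy / torch.compile choices are assumptions inferred from the AdamW, DDP, and torch._dynamo frames in the trace.

import lightning as L
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset


class Classifier(L.LightningModule):
    """Stand-in for the notebook's model; the real architecture is in the linked notebook."""

    def __init__(self, in_dim=128, num_classes=10):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(in_dim, 256), nn.ReLU(), nn.Linear(256, num_classes))

    def training_step(self, batch, batch_idx):
        x, y = batch  # assumed (features, labels) batches
        loss = F.cross_entropy(self.net(x), y)
        # Verbatim from the traceback (notebook line 74); the AssertionError
        # is raised from inside this call, in the result-metric sync path.
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, sync_dist=True)
        return loss

    def configure_optimizers(self):
        # AdamW matches the torch/optim/adamw.py frame in the traceback.
        return torch.optim.AdamW(self.parameters(), lr=1e-3)


class RandomData(L.LightningDataModule):
    """Stand-in data module; the real one loads the Kaggle dataset."""

    def train_dataloader(self):
        x, y = torch.randn(512, 128), torch.randint(0, 10, (512,))
        return DataLoader(TensorDataset(x, y), batch_size=32)


model = Classifier()
# The torch/_dynamo frames in the traceback suggest the module was compiled:
model = torch.compile(model)  # assumption

trainer = L.Trainer(
    accelerator="gpu",
    devices=2,                # 2x T4 on Kaggle
    strategy="ddp_notebook",  # assumption, consistent with the multiprocessing launcher in the trace
    max_epochs=1,
)
data_mod = RandomData()
trainer.fit(model, data_mod)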
Please help me resolve this error; I am very confused about what to do.

What version are you seeing the problem on?

v2.4

How to reproduce the bug

trainer.fit(model, data_mod) => data_mod is LightningDataModule

Error messages and logs
W1116 14:03:37.546000 140135548491584 torch/multiprocessing/spawn.py:146] Terminating process 131 via signal SIGTERM
INFO: [rank: 0] Received SIGTERM: 15
---------------------------------------------------------------------------
ProcessRaisedException Traceback (most recent call last)
Cell In[14], line 1
----> 1 trainer.fit(model, data_mod)
File /opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py:538, in Trainer.fit(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
536 self.state.status = TrainerStatus.RUNNING
537 self.training = True
--> 538 call._call_and_handle_interrupt(
539 self, self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
540 )
File /opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/call.py:46, in _call_and_handle_interrupt(trainer, trainer_fn, *args, **kwargs)
44 try:
45 if trainer.strategy.launcher is not None:
---> 46 return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
47 return trainer_fn(*args, **kwargs)
49 except _TunerExitException:
File /opt/conda/lib/python3.10/site-packages/lightning/pytorch/strategies/launchers/multiprocessing.py:144, in _MultiProcessingLauncher.launch(self, function, trainer, *args, **kwargs)
136 process_context = mp.start_processes(
137 self._wrapping_function,
138 args=process_args,
(...)
141 join=False, # we will join ourselves to get the process references
142 )
143 self.procs = process_context.processes
--> 144 while not process_context.join():
145 pass
147 worker_output = return_queue.get()
File /opt/conda/lib/python3.10/site-packages/torch/multiprocessing/spawn.py:189, in ProcessContext.join(self, timeout)
187 msg = "\n\n-- Process %d terminated with the following error:\n" % error_index
188 msg += original_trace
--> 189 raise ProcessRaisedException(msg, error_index, failed_process.pid)
ProcessRaisedException:
-- Process 1 terminated with the following error:
Traceback (most recent call last):
File "/opt/conda/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 76, in _wrap
fn(i, *args)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/strategies/launchers/multiprocessing.py", line 173, in _wrapping_function
results = function(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 574, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 981, in _run
results = self._run_stage()
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 1025, in _run_stage
self.fit_loop.run()
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/loops/fit_loop.py", line 205, in run
self.advance()
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/loops/fit_loop.py", line 363, in advance
self.epoch_loop.run(self._data_fetcher)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/loops/training_epoch_loop.py", line 140, in run
self.advance(data_fetcher)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/loops/training_epoch_loop.py", line 250, in advance
batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 190, in run
self._optimizer_step(batch_idx, closure)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 268, in _optimizer_step
call._call_lightning_module_hook(
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/call.py", line 167, in _call_lightning_module_hook
output = fn(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/core/module.py", line 1306, in optimizer_step
optimizer.step(closure=optimizer_closure)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/core/optimizer.py", line 153, in step
step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/strategies/ddp.py", line 270, in optimizer_step
optimizer_output = super().optimizer_step(optimizer, closure, model, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/strategies/strategy.py", line 238, in optimizer_step
return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/plugins/precision/precision.py", line 122, in optimizer_step
return optimizer.step(closure=closure, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/optim/optimizer.py", line 484, in wrapper
out = func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/optim/optimizer.py", line 89, in _use_grad
ret = func(self, *args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/optim/adamw.py", line 204, in step
loss = closure()
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/plugins/precision/precision.py", line 108, in _wrap_closure
closure_result = closure()
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 144, in __call__
self._result = self.closure(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 129, in closure
step_output = self._step_fn()
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 317, in _training_step
training_step_output = call._call_strategy_hook(trainer, "training_step", *kwargs.values())
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/call.py", line 319, in _call_strategy_hook
output = fn(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/strategies/strategy.py", line 389, in training_step
return self._forward_redirection(self.model, self.lightning_module, "training_step", *args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/strategies/strategy.py", line 640, in __call__
wrapper_output = wrapper_module(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1636, in forward
else self._run_ddp_forward(*inputs, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1454, in _run_ddp_forward
return self.module(*inputs, **kwargs) # type: ignore[index]
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/strategies/strategy.py", line 633, in wrapped_forward
out = method(*_args, **_kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 433, in _fn
return fn(*args, **kwargs)
File "/tmp/ipykernel_30/3650372019.py", line 74, in training_step
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, sync_dist=True)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/core/module.py", line 437, in log
apply_to_collection(value, dict, self.__check_not_nested, name)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/core/module.py", line 438, in torch_dynamo_resume_in_log_at_437
apply_to_collection(
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/core/module.py", line 484, in torch_dynamo_resume_in_log_at_438
results.reset(metrics=False, fx=self._current_fx_name)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/core/module.py", line 508, in torch_dynamo_resume_in_log_at_484
and is_param_in_hook_signature(self.training_step, "dataloader_iter", explicit=True)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/core/module.py", line 525, in torch_dynamo_resume_in_log_at_508
results.log(
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py", line 403, in log
metric = _ResultMetric(meta, isinstance(value, Tensor))
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py", line 404, in torch_dynamo_resume_in_log_at_403
self[key] = metric
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py", line 411, in torch_dynamo_resume_in_log_at_404
self[key].to(value.device)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py", line 414, in torch_dynamo_resume_in_log_at_411
self.update_metrics(key, value, batch_size)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py", line 419, in update_metrics
result_metric.forward(value, batch_size)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py", line 270, in forward
self.update(value, batch_size)
File "/opt/conda/lib/python3.10/site-packages/torchmetrics/metric.py", line 483, in wrapped_func
update(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py", line 225, in update
self._forward_cache = self.meta.sync(value.clone()) # `clone` because `sync` is in-place
File "/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py", line 144, in sync
assert self._sync is not None
AssertionError
Environment

Current environment

#- PyTorch Lightning Version (e.g., 2.4.0):
#- PyTorch Version (e.g., 2.4):
#- Python version (e.g., 3.12):
#- OS (e.g., Linux):
#- CUDA/cuDNN version:
#- GPU models and configuration:
#- How you installed Lightning (`conda`, `pip`, source):
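A quick way to fill in the template above from inside the notebook (a sketch; assumes a CUDA build of PyTorch):

import platform
import lightning
import torch

print("PyTorch Lightning Version:", lightning.__version__)
print("PyTorch Version:", torch.__version__)
print("Python version:", platform.python_version())
print("OS:", platform.platform())
print("CUDA version:", torch.version.cuda)
print("cuDNN version:", torch.backends.cudnn.version())
for i in range(torch.cuda.device_count()):
    print(f"GPU {i}:", torch.cuda.get_device_name(i))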
More info
No response