You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
The following error is raised when running MoD (llama3-8b) with DeepSpeed:
`Traceback (most recent call last):
File "/home/code/LLaMA-Factory/src/llamafactory/launcher.py", line 23, in
launch()
File "/home/code/LLaMA-Factory/src/llamafactory/launcher.py", line 19, in launch
run_exp()
File "/home/code/LLaMA-Factory/src/llamafactory/train/tuner.py", line 50, in run_exp
run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
File "/home/code/LLaMA-Factory/src/llamafactory/train/sft/workflow.py", line 89, in run_sft
train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/trainer.py", line 1932, in train
Traceback (most recent call last):
File "/home/code/LLaMA-Factory/src/llamafactory/launcher.py", line 23, in
Traceback (most recent call last):
File "/home/code/LLaMA-Factory/src/llamafactory/launcher.py", line 23, in
launch()
File "/home/code/LLaMA-Factory/src/llamafactory/launcher.py", line 19, in launch
run_exp()
File "/home/code/LLaMA-Factory/src/llamafactory/train/tuner.py", line 50, in run_exp
launch()
File "/home/code/LLaMA-Factory/src/llamafactory/launcher.py", line 19, in launch
run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
File "/home/code/LLaMA-Factory/src/llamafactory/train/sft/workflow.py", line 89, in run_sft
run_exp()
File "/home/code/LLaMA-Factory/src/llamafactory/train/tuner.py", line 50, in run_exp
return inner_training_loop(train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks) File "/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/trainer.py", line 2268, in _inner_training_loop
File "/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/trainer.py", line 1932, in train
File "/home/code/LLaMA-Factory/src/llamafactory/train/sft/workflow.py", line 89, in run_sft
train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/trainer.py", line 1932, in train
tr_loss_step = self.training_step(model, inputs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/trainer.py", line 3324, in training_step
return inner_training_loop(
File "/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/trainer.py", line 2268, in _inner_training_loop
return inner_training_loop(
File "/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/trainer.py", line 2268, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/trainer.py", line 3324, in training_step
tr_loss_step = self.training_step(model, inputs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/trainer.py", line 3324, in training_step
self.accelerator.backward(loss, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2126, in backward
self.deepspeed_engine_wrapped.backward(loss, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/accelerate/utils/deepspeed.py", line 175, in backward
self.engine.step()
self.accelerator.backward(loss, **kwargs) File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2160, in step
File "/usr/local/python3.10.2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2126, in backward
self.accelerator.backward(loss, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2126, in backward
self._take_model_step(lr_kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2066, in _take_model_step
self.deepspeed_engine_wrapped.backward(loss, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/accelerate/utils/deepspeed.py", line 175, in backward
self.deepspeed_engine_wrapped.backward(loss, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/accelerate/utils/deepspeed.py", line 175, in backward
self.engine.step()
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2160, in step
self.engine.step()
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2160, in step
self.optimizer.step()
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 2050, in step
self._take_model_step(lr_kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2066, in _take_model_step
self._take_model_step(lr_kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2066, in _take_model_step
self._optimizer_step(sub_group_id)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 947, in _optimizer_step
self.optimizer.step()
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
self.optimizer.step()
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 2050, in step
ret_val = func(*args, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 2050, in step
self.optimizer.step()
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/optim/lr_scheduler.py", line 68, in wrapper
return wrapped(*args, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/optim/optimizer.py", line 373, in wrapper
out = func(*args, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 266, in step
self._optimizer_step(sub_group_id)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 947, in _optimizer_step
self.init_state(group, p, gindex, pindex)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
self._optimizer_step(sub_group_id)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 947, in _optimizer_step
return func(*args, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 414, in init_state
state["state1"] = self.get_state_buffer(p, dtype=torch.uint8)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 309, in get_state_buffer
self.optimizer.step()
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/optim/lr_scheduler.py", line 68, in wrapper
self.optimizer.step()
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/optim/lr_scheduler.py", line 68, in wrapper
buff = F.get_paged(*p.shape, dtype=dtype, device=p.device)
return wrapped(*args, **kwargs) File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/functional.py", line 171, in get_paged
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/optim/optimizer.py", line 373, in wrapper
return wrapped(*args, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/optim/optimizer.py", line 373, in wrapper
cuda_ptr = lib.cget_managed_ptr(ct.c_size_t(num_bytes))
AttributeError: 'NoneType' object has no attribute 'cget_managed_ptr'
out = func(*args, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
out = func(*args, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 266, in step
return func(*args, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 266, in step
self.init_state(group, p, gindex, pindex)self.init_state(group, p, gindex, pindex)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)return func(*args, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 414, in init_state
File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 414, in init_state
state["state1"] = self.get_state_buffer(p, dtype=torch.uint8)state["state1"] = self.get_state_buffer(p, dtype=torch.uint8)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 309, in get_state_buffer
File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 309, in get_state_buffer
buff = F.get_paged(*p.shape, dtype=dtype, device=p.device)buff = F.get_paged(*p.shape, dtype=dtype, device=p.device)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/functional.py", line 171, in get_paged
File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/functional.py", line 171, in get_paged
cuda_ptr = lib.cget_managed_ptr(ct.c_size_t(num_bytes))cuda_ptr = lib.cget_managed_ptr(ct.c_size_t(num_bytes))
AttributeError: 'NoneType' object has no attribute 'cget_managed_ptr'
AttributeError: 'NoneType' object has no attribute 'cget_managed_ptr'
0%| | 0/120 [00:29<?, ?it/s]
[2024-07-04 06:19:40,168] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 1412100) of binary: /usr/local/python3.10.2/bin/python3.10
Traceback (most recent call last):
File "/usr/local/python3.10.2/bin/torchrun", line 8, in
sys.exit(main())
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/init.py", line 346, in wrapper
return f(*args, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/distributed/run.py", line 806, in main
run(args)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/distributed/run.py", line 797, in run
elastic_launch(
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in call
return launch_agent(self._config, self._entrypoint, list(args))
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
The following error is raised when running MoD (llama3-8b) with DeepSpeed:
`Traceback (most recent call last):
File "/home/code/LLaMA-Factory/src/llamafactory/launcher.py", line 23, in
launch()
File "/home/code/LLaMA-Factory/src/llamafactory/launcher.py", line 19, in launch
run_exp()
File "/home/code/LLaMA-Factory/src/llamafactory/train/tuner.py", line 50, in run_exp
run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
File "/home/code/LLaMA-Factory/src/llamafactory/train/sft/workflow.py", line 89, in run_sft
train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/trainer.py", line 1932, in train
Traceback (most recent call last):
File "/home/code/LLaMA-Factory/src/llamafactory/launcher.py", line 23, in
Traceback (most recent call last):
File "/home/code/LLaMA-Factory/src/llamafactory/launcher.py", line 23, in
launch()
File "/home/code/LLaMA-Factory/src/llamafactory/launcher.py", line 19, in launch
run_exp()
File "/home/code/LLaMA-Factory/src/llamafactory/train/tuner.py", line 50, in run_exp
launch()
File "/home/code/LLaMA-Factory/src/llamafactory/launcher.py", line 19, in launch
run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
File "/home/code/LLaMA-Factory/src/llamafactory/train/sft/workflow.py", line 89, in run_sft
run_exp()
File "/home/code/LLaMA-Factory/src/llamafactory/train/tuner.py", line 50, in run_exp
return inner_training_loop(train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks) File "/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/trainer.py", line 2268, in _inner_training_loop
File "/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/trainer.py", line 1932, in train
File "/home/code/LLaMA-Factory/src/llamafactory/train/sft/workflow.py", line 89, in run_sft
train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/trainer.py", line 1932, in train
tr_loss_step = self.training_step(model, inputs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/trainer.py", line 3324, in training_step
return inner_training_loop(
File "/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/trainer.py", line 2268, in _inner_training_loop
return inner_training_loop(
File "/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/trainer.py", line 2268, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/trainer.py", line 3324, in training_step
tr_loss_step = self.training_step(model, inputs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/trainer.py", line 3324, in training_step
self.accelerator.backward(loss, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2126, in backward
self.deepspeed_engine_wrapped.backward(loss, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/accelerate/utils/deepspeed.py", line 175, in backward
self.engine.step()
self.accelerator.backward(loss, **kwargs) File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2160, in step
File "/usr/local/python3.10.2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2126, in backward
self.accelerator.backward(loss, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2126, in backward
self._take_model_step(lr_kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2066, in _take_model_step
self.deepspeed_engine_wrapped.backward(loss, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/accelerate/utils/deepspeed.py", line 175, in backward
self.deepspeed_engine_wrapped.backward(loss, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/accelerate/utils/deepspeed.py", line 175, in backward
self.engine.step()
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2160, in step
self.engine.step()
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2160, in step
self.optimizer.step()
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 2050, in step
self._take_model_step(lr_kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2066, in _take_model_step
self._take_model_step(lr_kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2066, in _take_model_step
self._optimizer_step(sub_group_id)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 947, in _optimizer_step
self.optimizer.step()
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
self.optimizer.step()
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 2050, in step
ret_val = func(*args, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 2050, in step
self.optimizer.step()
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/optim/lr_scheduler.py", line 68, in wrapper
return wrapped(*args, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/optim/optimizer.py", line 373, in wrapper
out = func(*args, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 266, in step
self._optimizer_step(sub_group_id)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 947, in _optimizer_step
self.init_state(group, p, gindex, pindex)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
self._optimizer_step(sub_group_id)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 947, in _optimizer_step
return func(*args, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 414, in init_state
state["state1"] = self.get_state_buffer(p, dtype=torch.uint8)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 309, in get_state_buffer
self.optimizer.step()
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/optim/lr_scheduler.py", line 68, in wrapper
self.optimizer.step()
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/optim/lr_scheduler.py", line 68, in wrapper
buff = F.get_paged(*p.shape, dtype=dtype, device=p.device)
return wrapped(*args, **kwargs) File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/functional.py", line 171, in get_paged
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/optim/optimizer.py", line 373, in wrapper
return wrapped(*args, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/optim/optimizer.py", line 373, in wrapper
cuda_ptr = lib.cget_managed_ptr(ct.c_size_t(num_bytes))
AttributeError: 'NoneType' object has no attribute 'cget_managed_ptr'
out = func(*args, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
out = func(*args, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 266, in step
return func(*args, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 266, in step
self.init_state(group, p, gindex, pindex)self.init_state(group, p, gindex, pindex)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)return func(*args, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 414, in init_state
File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 414, in init_state
state["state1"] = self.get_state_buffer(p, dtype=torch.uint8)state["state1"] = self.get_state_buffer(p, dtype=torch.uint8)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 309, in get_state_buffer
File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 309, in get_state_buffer
buff = F.get_paged(*p.shape, dtype=dtype, device=p.device)buff = F.get_paged(*p.shape, dtype=dtype, device=p.device)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/functional.py", line 171, in get_paged
File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/functional.py", line 171, in get_paged
cuda_ptr = lib.cget_managed_ptr(ct.c_size_t(num_bytes))cuda_ptr = lib.cget_managed_ptr(ct.c_size_t(num_bytes))
AttributeError: 'NoneType' object has no attribute 'cget_managed_ptr'
AttributeError: 'NoneType' object has no attribute 'cget_managed_ptr'
0%| | 0/120 [00:29<?, ?it/s]
[2024-07-04 06:19:40,168] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 1412100) of binary: /usr/local/python3.10.2/bin/python3.10
Traceback (most recent call last):
File "/usr/local/python3.10.2/bin/torchrun", line 8, in
sys.exit(main())
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/init.py", line 346, in wrapper
return f(*args, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/distributed/run.py", line 806, in main
run(args)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/distributed/run.py", line 797, in run
elastic_launch(
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in call
return launch_agent(self._config, self._entrypoint, list(args))
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
/home/code/LLaMA-Factory/src/llamafactory/launcher.py FAILED
Failures:
[1]:
time : 2024-07-04_06:19:40
host : localhost.localdomain
rank : 1 (local_rank: 1)
exitcode : 1 (pid: 1412101)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[2]:
time : 2024-07-04_06:19:40
host : localhost.localdomain
rank : 2 (local_rank: 2)
exitcode : 1 (pid: 1412102)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
Root Cause (first observed failure):
[0]:
time : 2024-07-04_06:19:40
host : localhost.localdomain
rank : 0 (local_rank: 0)
exitcode : 1 (pid: 1412100)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html`
The text was updated successfully, but these errors were encountered: