You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
I have built the Docker image referenced in the README and installed the dependencies from requirements.txt. One difference is that I'm using Singularity instead of Docker.
I face the following error only when executing main.py with mosaic-bert-base-uncased.yaml (hf_bert works fine)
Here is the error I see after tokenization. I would appreciate any guidance you can give me. Thanks for the amazing work!
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/u/user/examples/examples/benchmarks/bert/main.py", line 269, in
main(cfg)
File "/u/user/examples/examples/benchmarks/bert/main.py", line 256, in main
trainer.fit()
File "/u/user/.local/lib/python3.10/site-packages/composer/trainer/trainer.py", line 1766, in fit
self._train_loop()
File "/u/user/.local/lib/python3.10/site-packages/composer/trainer/trainer.py", line 1940, in _train_loop
total_loss_dict = self._train_batch(use_grad_scaling)
File "/u/user/.local/lib/python3.10/site-packages/composer/trainer/trainer.py", line 2115, in _train_batch
optimizer.step(closure=lambda **kwargs: self._train_microbatches(
File "/usr/lib/python3/dist-packages/torch/optim/lr_scheduler.py", line 68, in wrapper
return wrapped(*args, **kwargs)
File "/usr/lib/python3/dist-packages/torch/optim/optimizer.py", line 140, in wrapper
out = func(*args, **kwargs)
File "/usr/lib/python3/dist-packages/torch/autograd/grad_mode.py", line 27, in decorate_context
return func(*args, **kwargs)
File "/u/user/.local/lib/python3.10/site-packages/composer/optim/decoupled_weight_decay.py", line 288, in step
loss = closure()
File "/u/user/.local/lib/python3.10/site-packages/composer/trainer/trainer.py", line 2115, in
optimizer.step(closure=lambda **kwargs: self._train_microbatches(
File "/u/user/.local/lib/python3.10/site-packages/composer/trainer/trainer.py", line 2213, in _train_microbatches
microbatch_loss_dict = self._train_microbatch(use_grad_scaling, current_batch_size, is_final_microbatch)
File "/u/user/.local/lib/python3.10/site-packages/composer/trainer/trainer.py", line 2276, in _train_microbatch
self.state.outputs = self.state.model(self.state.batch)
File "/usr/lib/python3/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/u/user/.local/lib/python3.10/site-packages/composer/models/huggingface.py", line 314, in forward
output = self.model(**batch) # type: ignore (thirdparty)
File "/usr/lib/python3/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/u/user/examples/examples/benchmarks/bert/src/bert_layers.py", line 858, in forward
outputs = self.bert(
File "/usr/lib/python3/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/u/user/examples/examples/benchmarks/bert/src/bert_layers.py", line 677, in forward
encoder_outputs = self.encoder(
File "/usr/lib/python3/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/u/user/examples/examples/benchmarks/bert/src/bert_layers.py", line 533, in forward
hidden_states = layer_module(hidden_states,
File "/usr/lib/python3/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/u/user/examples/examples/benchmarks/bert/src/bert_layers.py", line 395, in forward
attention_output = self.attention(hidden_states, cu_seqlens, seqlen,
File "/usr/lib/python3/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/u/user/examples/examples/benchmarks/bert/src/bert_layers.py", line 307, in forward
self_output = self.self(input_tensor, cu_seqlens, max_s, indices,
File "/usr/lib/python3/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/u/user/examples/examples/benchmarks/bert/src/bert_layers.py", line 241, in forward
attention = flash_attn_qkvpacked_func(qkv, bias)
File "/u/user/examples/examples/benchmarks/bert/src/flash_attn_triton.py", line 1021, in forward
o, lse, ctx.softmax_scale = _flash_attn_forward(
File "/u/user/examples/examples/benchmarks/bert/src/flash_attn_triton.py", line 826, in _flash_attn_forward
_fwd_kernel[grid]( # type: ignore
File "/u/user/.local/lib/python3.10/site-packages/triton/runtime/jit.py", line 106, in launcher
return self.run(*args, grid=grid, **kwargs)
File "/u/user/.local/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 86, in run
return self.fn.run(*args, num_warps=config.num_warps, num_stages=config.num_stages, **kwargs, **config.kwargs)
File "/u/user/.local/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 200, in run
return self.fn.run(*args, **kwargs)
File "", line 41, in _fwd_kernel
File "/u/user/.local/lib/python3.10/site-packages/triton/compiler.py", line 1239, in compile
so = _build(fn.name, src_path, tmpdir)
File "/u/user/.local/lib/python3.10/site-packages/triton/compiler.py", line 1169, in _build
ret = subprocess.check_call(cc_cmd)
File "/usr/lib/python3.10/subprocess.py", line 364, in check_call
retcode = call(*popenargs, **kwargs)
File "/usr/lib/python3.10/subprocess.py", line 345, in call
with Popen(*popenargs, **kwargs) as p:
File "/usr/lib/python3.10/subprocess.py", line 971, in init
self._execute_child(args, executable, preexec_fn, close_fds,
File "/usr/lib/python3.10/subprocess.py", line 1863, in _execute_child
raise child_exception_type(errno_num, err_msg, err_filename)
FileNotFoundError: [Errno 2] No such file or directory: '/sw/spack/sys11-2023-03/apps/linux-rhel8-x86_64/gcc-8.5.0/gcc-11.4.0-yycklku/bin/gcc'
The text was updated successfully, but these errors were encountered:
naveenkumar2703
changed the title
Error when training with Mosac-Bert
Error when training with Mosaic-Bert
Feb 5, 2024
I have built the Docker image referenced in the README and installed the dependencies from requirements.txt. One difference is that I'm using Singularity instead of Docker.
I face the following error only when executing main.py with mosaic-bert-base-uncased.yaml (hf_bert works fine)
Here is the error I see after tokenization. I would appreciate any guidance you can give me. Thanks for the amazing work!
Traceback (most recent call last):
File "", line 21, in _fwd_kernel
KeyError: ('2-.-0-.-0-d82511111ad128294e9d31a6ac684238-7929002797455b30efce6e41eddc6b57-3aa563e00c5c695dd945e23b09a86848-d962222789c30252d492a16cca3bf467-ff946bd4b3b4a4cbdf8cedc6e1c658e0-5c5e32ff210f3b7f56c98ca29917c25e-06f0df2d61979d629033f4a22eff5198-0dd03b0bd512a184b3512b278d9dfa59-d35ab04ae841e2714a253c523530b071', (torch.bfloat16, torch.bfloat16, torch.bfloat16, torch.float32, torch.bfloat16, torch.float32, torch.float32, 'fp32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32'), ('matrix', False, 64, True, True, True, 128, 128), (True, True, True, True, True, True, True, (False,), (True, False), (True, False), (True, False), (True, False), (True, False), (True, False), (True, False), (True, False), (True, False), (True, False), (True, False), (True, False), (True, False), (True, False), (True, False), (False, False), (True, False), (True, False), (True, False), (True, False), (False, False), (False, False)))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/u/user/examples/examples/benchmarks/bert/main.py", line 269, in
main(cfg)
File "/u/user/examples/examples/benchmarks/bert/main.py", line 256, in main
trainer.fit()
File "/u/user/.local/lib/python3.10/site-packages/composer/trainer/trainer.py", line 1766, in fit
self._train_loop()
File "/u/user/.local/lib/python3.10/site-packages/composer/trainer/trainer.py", line 1940, in _train_loop
total_loss_dict = self._train_batch(use_grad_scaling)
File "/u/user/.local/lib/python3.10/site-packages/composer/trainer/trainer.py", line 2115, in _train_batch
optimizer.step(closure=lambda **kwargs: self._train_microbatches(
File "/usr/lib/python3/dist-packages/torch/optim/lr_scheduler.py", line 68, in wrapper
return wrapped(*args, **kwargs)
File "/usr/lib/python3/dist-packages/torch/optim/optimizer.py", line 140, in wrapper
out = func(*args, **kwargs)
File "/usr/lib/python3/dist-packages/torch/autograd/grad_mode.py", line 27, in decorate_context
return func(*args, **kwargs)
File "/u/user/.local/lib/python3.10/site-packages/composer/optim/decoupled_weight_decay.py", line 288, in step
loss = closure()
File "/u/user/.local/lib/python3.10/site-packages/composer/trainer/trainer.py", line 2115, in
optimizer.step(closure=lambda **kwargs: self._train_microbatches(
File "/u/user/.local/lib/python3.10/site-packages/composer/trainer/trainer.py", line 2213, in _train_microbatches
microbatch_loss_dict = self._train_microbatch(use_grad_scaling, current_batch_size, is_final_microbatch)
File "/u/user/.local/lib/python3.10/site-packages/composer/trainer/trainer.py", line 2276, in _train_microbatch
self.state.outputs = self.state.model(self.state.batch)
File "/usr/lib/python3/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/u/user/.local/lib/python3.10/site-packages/composer/models/huggingface.py", line 314, in forward
output = self.model(**batch) # type: ignore (thirdparty)
File "/usr/lib/python3/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/u/user/examples/examples/benchmarks/bert/src/bert_layers.py", line 858, in forward
outputs = self.bert(
File "/usr/lib/python3/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/u/user/examples/examples/benchmarks/bert/src/bert_layers.py", line 677, in forward
encoder_outputs = self.encoder(
File "/usr/lib/python3/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/u/user/examples/examples/benchmarks/bert/src/bert_layers.py", line 533, in forward
hidden_states = layer_module(hidden_states,
File "/usr/lib/python3/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/u/user/examples/examples/benchmarks/bert/src/bert_layers.py", line 395, in forward
attention_output = self.attention(hidden_states, cu_seqlens, seqlen,
File "/usr/lib/python3/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/u/user/examples/examples/benchmarks/bert/src/bert_layers.py", line 307, in forward
self_output = self.self(input_tensor, cu_seqlens, max_s, indices,
File "/usr/lib/python3/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/u/user/examples/examples/benchmarks/bert/src/bert_layers.py", line 241, in forward
attention = flash_attn_qkvpacked_func(qkv, bias)
File "/u/user/examples/examples/benchmarks/bert/src/flash_attn_triton.py", line 1021, in forward
o, lse, ctx.softmax_scale = _flash_attn_forward(
File "/u/user/examples/examples/benchmarks/bert/src/flash_attn_triton.py", line 826, in _flash_attn_forward
_fwd_kernel[grid]( # type: ignore
File "/u/user/.local/lib/python3.10/site-packages/triton/runtime/jit.py", line 106, in launcher
return self.run(*args, grid=grid, **kwargs)
File "/u/user/.local/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 86, in run
return self.fn.run(*args, num_warps=config.num_warps, num_stages=config.num_stages, **kwargs, **config.kwargs)
File "/u/user/.local/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 200, in run
return self.fn.run(*args, **kwargs)
File "", line 41, in _fwd_kernel
File "/u/user/.local/lib/python3.10/site-packages/triton/compiler.py", line 1239, in compile
so = _build(fn.name, src_path, tmpdir)
File "/u/user/.local/lib/python3.10/site-packages/triton/compiler.py", line 1169, in _build
ret = subprocess.check_call(cc_cmd)
File "/usr/lib/python3.10/subprocess.py", line 364, in check_call
retcode = call(*popenargs, **kwargs)
File "/usr/lib/python3.10/subprocess.py", line 345, in call
with Popen(*popenargs, **kwargs) as p:
File "/usr/lib/python3.10/subprocess.py", line 971, in init
self._execute_child(args, executable, preexec_fn, close_fds,
File "/usr/lib/python3.10/subprocess.py", line 1863, in _execute_child
raise child_exception_type(errno_num, err_msg, err_filename)
FileNotFoundError: [Errno 2] No such file or directory: '/sw/spack/sys11-2023-03/apps/linux-rhel8-x86_64/gcc-8.5.0/gcc-11.4.0-yycklku/bin/gcc'
The text was updated successfully, but these errors were encountered: