Closed as not planned
Your current environment
ERROR:asyncio:Exception in callback functools.partial(<function _raise_exception_on_finish at 0x7fd321802200>, error_callback=<bound method AsyncLLMEngine._error_callback of <vllm.engine.async_llm_engine.AsyncLLMEngine object at 0x7fd316450310>>)
handle: <Handle functools.partial(<function _raise_exception_on_finish at 0x7fd321802200>, error_callback=<bound method AsyncLLMEngine._error_callback of <vllm.engine.async_llm_engine.AsyncLLMEngine object at 0x7fd316450310>>)>
Traceback (most recent call last):
  File "/data/tangjiakai/anaconda3/lib/python3.11/site-packages/vllm/engine/async_llm_engine.py", line 40, in _raise_exception_on_finish
    task.result()
  File "/data/tangjiakai/anaconda3/lib/python3.11/site-packages/vllm/engine/async_llm_engine.py", line 521, in run_engine_loop
    has_requests_in_progress = await asyncio.wait_for(
                               ^^^^^^^^^^^^^^^^^^^^^^^
  File "/data/tangjiakai/anaconda3/lib/python3.11/asyncio/tasks.py", line 479, in wait_for
    return fut.result()
           ^^^^^^^^^^^^
  File "/data/tangjiakai/anaconda3/lib/python3.11/site-packages/vllm/engine/async_llm_engine.py", line 495, in engine_step
    request_outputs = await self.engine.step_async()
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data/tangjiakai/anaconda3/lib/python3.11/site-packages/vllm/engine/async_llm_engine.py", line 226, in step_async
    output = await self.model_executor.execute_model_async(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data/tangjiakai/anaconda3/lib/python3.11/site-packages/vllm/executor/gpu_executor.py", line 117, in execute_model_async
    output = await make_async(self.driver_worker.execute_model
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data/tangjiakai/anaconda3/lib/python3.11/concurrent/futures/thread.py", line 58, in run
    result = self.fn(*self.args, **self.kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data/tangjiakai/anaconda3/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/data/tangjiakai/anaconda3/lib/python3.11/site-packages/vllm/worker/worker.py", line 272, in execute_model
    output = self.model_runner.execute_model(seq_group_metadata_list,
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data/tangjiakai/anaconda3/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/data/tangjiakai/anaconda3/lib/python3.11/site-packages/vllm/worker/model_runner.py", line 731, in execute_model
    logits = self.model.compute_logits(hidden_states, sampling_metadata)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data/tangjiakai/anaconda3/lib/python3.11/site-packages/vllm/model_executor/models/llama.py", line 369, in compute_logits
    logits = self.logits_processor(self.lm_head.weight, hidden_states,
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data/tangjiakai/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data/tangjiakai/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data/tangjiakai/anaconda3/lib/python3.11/site-packages/vllm/lora/layers.py", line 1195, in forward
    return type(self.base_layer).forward(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data/tangjiakai/anaconda3/lib/python3.11/site-packages/vllm/model_executor/layers/logits_processor.py", line 55, in forward
    logits *= self.scale
RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "uvloop/cbhandles.pyx", line 63, in uvloop.loop.Handle._run
  File "/data/tangjiakai/anaconda3/lib/python3.11/site-packages/vllm/engine/async_llm_engine.py", line 47, in _raise_exception_on_finish
    raise AsyncEngineDeadError(
vllm.engine.async_llm_engine.AsyncEngineDeadError: Task finished unexpectedly. This should never happen! Please open an issue on Github. See stack trace above for the actual cause.
ERROR: Exception in ASGI application
  + Exception Group Traceback (most recent call last):
  |   File "/data/tangjiakai/anaconda3/lib/python3.11/site-packages/starlette/_utils.py", line 77, in collapse_excgroups
  |     yield
  |   File "/data/tangjiakai/anaconda3/lib/python3.11/site-packages/starlette/middleware/base.py", line 186, in __call__
  |     async with anyio.create_task_group() as task_group:
  |   File "/data/tangjiakai/anaconda3/lib/python3.11/site-packages/anyio/_backends/_asyncio.py", line 680, in __aexit__
  |     raise BaseExceptionGroup(
  | ExceptionGroup: unhandled errors in a TaskGroup (1 sub-exception)
  +-+---------------- 1 ----------------
    | Traceback (most recent call last):
    |   File "/data/tangjiakai/anaconda3/lib/python3.11/site-packages/uvicorn/protocols/http/httptools_impl.py", line 401, in run_asgi
    |     result = await app( # type: ignore[func-returns-value]
    |              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/data/tangjiakai/anaconda3/lib/python3.11/site-packages/uvicorn/middleware/proxy_headers.py", line 70, in __call__
    |     return await self.app(scope, receive, send)
    |            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/data/tangjiakai/anaconda3/lib/python3.11/site-packages/fastapi/applications.py", line 1054, in __call__
    |     await super().__call__(scope, receive, send)
    |   File "/data/tangjiakai/anaconda3/lib/python3.11/site-packages/starlette/applications.py", line 113, in __call__
    |     await self.middleware_stack(scope, receive, send)
    |   File "/data/tangjiakai/anaconda3/lib/python3.11/site-packages/starlette/middleware/errors.py", line 187, in __call__
    |     raise exc
    |   File "/data/tangjiakai/anaconda3/lib/python3.11/site-packages/starlette/middleware/errors.py", line 165, in __call__
    |     await self.app(scope, receive, _send)
    |   File "/data/tangjiakai/anaconda3/lib/python3.11/site-packages/starlette/middleware/base.py", line 185, in __call__
    |     with collapse_excgroups():
    |   File "/data/tangjiakai/anaconda3/lib/python3.11/contextlib.py", line 155, in __exit__
    |     self.gen.throw(typ, value, traceback)
    |   File "/data/tangjiakai/anaconda3/lib/python3.11/site-packages/starlette/_utils.py", line 83, in collapse_excgroups
    |     raise exc
    |   File "/data/tangjiakai/anaconda3/lib/python3.11/site-packages/starlette/middleware/base.py", line 187, in __call__
    |     response = await self.dispatch_func(request, call_next)
    |                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/data/tangjiakai/anaconda3/lib/python3.11/site-packages/vllm/entrypoints/openai/api_server.py", line 164, in authentication
    |     return await call_next(request)
    |            ^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/data/tangjiakai/anaconda3/lib/python3.11/site-packages/starlette/middleware/base.py", line 163, in call_next
    |     raise app_exc
    |   File "/data/tangjiakai/anaconda3/lib/python3.11/site-packages/starlette/middleware/base.py", line 187, in __call__
    |     response = await self.dispatch_func(request, call_next)
    |                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/data/tangjiakai/anaconda3/lib/python3.11/site-packages/vllm/entrypoints/openai/api_server.py", line 164, in authentication
    |     return await call_next(request)
    |            ^^^^^^^^^^^^^^^^^^^^^^^^
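As the first traceback itself warns, CUDA kernel errors may be reported asynchronously, so the frame blamed above (logits *= self.scale in logits_processor.py) may not be the true fault site. A minimal sketch of the debugging step the error message suggests, rerunning the same launch command (flags abbreviated here; the full command is in the bug description below) with synchronous kernel launches so the stack trace points at the kernel that actually faulted:

# CUDA_LAUNCH_BLOCKING=1 makes each kernel complete before the next CUDA
# API call, so the illegal access is reported at its real origin.
CUDA_LAUNCH_BLOCKING=1 python -m vllm.entrypoints.openai.api_server \
    --model /data/pretrain_dir/Meta-Llama-3-8B-Instruct \
    --port 8083 --enforce-eager --enable-prefix-caching --enable-lora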
🐛 Describe the bug
I run the API server with the following command:
python -m vllm.entrypoints.openai.api_server \
--model /data/pretrain_dir/Meta-Llama-3-8B-Instruct \
--trust-remote-code \
--port 8083 \
--dtype auto \
--pipeline-parallel-size 1 \
--enforce-eager \
--enable-prefix-caching \
--enable-lora \
--gpu-memory-utilization 0.6
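For reference, a hedged sketch of the kind of request that reaches the failing path: with --enable-lora, even base-model completions go through the LoRA wrapper in vllm/lora/layers.py seen in the traceback. The prompt and sampling parameters below are illustrative, not the exact request from the failing run:

curl http://localhost:8083/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
          "model": "/data/pretrain_dir/Meta-Llama-3-8B-Instruct",
          "prompt": "Hello, my name is",
          "max_tokens": 16
        }'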