You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Trace Llama-3.2-1B using NXD as shown in the generation_demo.py example.
Then try to deploy it in a Docker container using this vLLM command:
vllm serve /mnt/models
--device neuron
--tensor-parallel-size 2
--max-model-len 64
--max-num-seqs 32
You will see the following error:
Traceback (most recent call last):
File "/opt/conda/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
self.run()
File "/opt/conda/lib/python3.10/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/app/vllm/vllm/vllm/engine/multiprocessing/engine.py", line 388, in run_mp_engine
engine = MQLLMEngine.from_engine_args(engine_args=engine_args,
File "/app/vllm/vllm/vllm/engine/multiprocessing/engine.py", line 138, in from_engine_args
return cls(
File "/app/vllm/vllm/vllm/engine/multiprocessing/engine.py", line 78, in __init__
self.engine = LLMEngine(*args,
File "/app/vllm/vllm/vllm/engine/llm_engine.py", line 325, in __init__
self.model_executor = executor_class(
File "/app/vllm/vllm/vllm/executor/executor_base.py", line 47, in __init__
self._init_executor()
File "/app/vllm/vllm/vllm/executor/neuron_executor.py", line 23, in _init_executor
self._init_worker()
File "/app/vllm/vllm/vllm/executor/neuron_executor.py", line 39, in _init_worker
self.driver_worker.load_model()
File "/app/vllm/vllm/vllm/worker/neuron_worker.py", line 134, in load_model
self.model_runner.load_model()
File "/app/vllm/vllm/vllm/worker/neuron_model_runner.py", line 120, in load_model
self.model = get_neuron_model(
File "/app/vllm/vllm/vllm/model_executor/model_loader/neuron.py", line 301, in get_neuron_model
model.load_weights(model_config.model,
File "/app/vllm/vllm/vllm/model_executor/model_loader/neuron.py", line 115, in load_weights
self.model.to_neuron()
File "/opt/conda/lib/python3.10/site-packages/transformers_neuronx/base.py", line 84, in to_neuron
self.load_weights()
File "/opt/conda/lib/python3.10/site-packages/transformers_neuronx/llama/model.py", line 118, in load_weights
self.materialize_embeddings()
File "/opt/conda/lib/python3.10/site-packages/transformers_neuronx/llama/model.py", line 234, in materialize_embeddings
self.chkpt_model.model.embed_tokens.materialize()
File "/opt/conda/lib/python3.10/site-packages/transformers_neuronx/module.py", line 80, in materialize
raise FileNotFoundError(f'Could not find a weight for {param._global_key} in {param._file_path}')
FileNotFoundError: Could not find a weight for model.embed_tokens.weight in /mnt/models/model-0002-of-0002.safetensors
The text was updated successfully, but these errors were encountered:
To reproduce:
trace Llama-3.2-1B using NXD as shown in the
generation_demo.py
example. Then try to deploy it in a Docker container using this vLLM command:
vllm serve /mnt/models
--device neuron
--tensor-parallel-size 2
--max-model-len 64
--max-num-seqs 32
You will see the following error:
Traceback (most recent call last):
File "/opt/conda/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
self.run()
File "/opt/conda/lib/python3.10/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/app/vllm/vllm/vllm/engine/multiprocessing/engine.py", line 388, in run_mp_engine
engine = MQLLMEngine.from_engine_args(engine_args=engine_args,
File "/app/vllm/vllm/vllm/engine/multiprocessing/engine.py", line 138, in from_engine_args
return cls(
File "/app/vllm/vllm/vllm/engine/multiprocessing/engine.py", line 78, in __init__
self.engine = LLMEngine(*args,
File "/app/vllm/vllm/vllm/engine/llm_engine.py", line 325, in __init__
self.model_executor = executor_class(
File "/app/vllm/vllm/vllm/executor/executor_base.py", line 47, in __init__
self._init_executor()
File "/app/vllm/vllm/vllm/executor/neuron_executor.py", line 23, in _init_executor
self._init_worker()
File "/app/vllm/vllm/vllm/executor/neuron_executor.py", line 39, in _init_worker
self.driver_worker.load_model()
File "/app/vllm/vllm/vllm/worker/neuron_worker.py", line 134, in load_model
self.model_runner.load_model()
File "/app/vllm/vllm/vllm/worker/neuron_model_runner.py", line 120, in load_model
self.model = get_neuron_model(
File "/app/vllm/vllm/vllm/model_executor/model_loader/neuron.py", line 301, in get_neuron_model
model.load_weights(model_config.model,
File "/app/vllm/vllm/vllm/model_executor/model_loader/neuron.py", line 115, in load_weights
self.model.to_neuron()
File "/opt/conda/lib/python3.10/site-packages/transformers_neuronx/base.py", line 84, in to_neuron
self.load_weights()
File "/opt/conda/lib/python3.10/site-packages/transformers_neuronx/llama/model.py", line 118, in load_weights
self.materialize_embeddings()
File "/opt/conda/lib/python3.10/site-packages/transformers_neuronx/llama/model.py", line 234, in materialize_embeddings
self.chkpt_model.model.embed_tokens.materialize()
File "/opt/conda/lib/python3.10/site-packages/transformers_neuronx/module.py", line 80, in materialize
raise FileNotFoundError(f'Could not find a weight for {param._global_key} in {param._file_path}')
FileNotFoundError: Could not find a weight for model.embed_tokens.weight in /mnt/models/model-0002-of-0002.safetensors
The text was updated successfully, but these errors were encountered: