Description
To reproduce:
- trace Llama-3.2-1B using NxD as shown in the generation_demo.py example.
- try to deploy it in a Docker container using this vLLM command:
  vllm serve /mnt/models --device neuron --tensor-parallel-size 2 --max-model-len 64 --max-num-seqs 32
- you will see the following error:
Traceback (most recent call last):
File "/opt/conda/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
self.run()
File "/opt/conda/lib/python3.10/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/app/vllm/vllm/vllm/engine/multiprocessing/engine.py", line 388, in run_mp_engine
engine = MQLLMEngine.from_engine_args(engine_args=engine_args,
File "/app/vllm/vllm/vllm/engine/multiprocessing/engine.py", line 138, in from_engine_args
return cls(
File "/app/vllm/vllm/vllm/engine/multiprocessing/engine.py", line 78, in __init__
self.engine = LLMEngine(*args,
File "/app/vllm/vllm/vllm/engine/llm_engine.py", line 325, in __init__
self.model_executor = executor_class(
File "/app/vllm/vllm/vllm/executor/executor_base.py", line 47, in init
self._init_executor()
File "/app/vllm/vllm/vllm/executor/neuron_executor.py", line 23, in _init_executor
self._init_worker()
File "/app/vllm/vllm/vllm/executor/neuron_executor.py", line 39, in _init_worker
self.driver_worker.load_model()
File "/app/vllm/vllm/vllm/worker/neuron_worker.py", line 134, in load_model
self.model_runner.load_model()
File "/app/vllm/vllm/vllm/worker/neuron_model_runner.py", line 120, in load_model
self.model = get_neuron_model(
File "/app/vllm/vllm/vllm/model_executor/model_loader/neuron.py", line 301, in get_neuron_model
model.load_weights(model_config.model,
File "/app/vllm/vllm/vllm/model_executor/model_loader/neuron.py", line 115, in load_weights
self.model.to_neuron()
File "/opt/conda/lib/python3.10/site-packages/transformers_neuronx/base.py", line 84, in to_neuron
self.load_weights()
File "/opt/conda/lib/python3.10/site-packages/transformers_neuronx/llama/model.py", line 118, in load_weights
self.materialize_embeddings()
File "/opt/conda/lib/python3.10/site-packages/transformers_neuronx/llama/model.py", line 234, in materialize_embeddings
self.chkpt_model.model.embed_tokens.materialize()
File "/opt/conda/lib/python3.10/site-packages/transformers_neuronx/module.py", line 80, in materialize
raise FileNotFoundError(f'Could not find a weight for {param._global_key} in {param._file_path}')
FileNotFoundError: Could not find a weight for model.embed_tokens.weight in /mnt/models/model-0002-of-0002.safetensors