Skip to content

Commit 77d25ba

Browse files
author
ddchenhao66
committed
[XPU] Fix cache KV transfer process startup failure on non-zero XPU cards
1 parent beb151b commit 77d25ba

File tree

4 files changed

+9
-6
lines changed

4 files changed

+9
-6
lines changed

custom_ops/xpu_ops/src/ops/share_external_data.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ std::vector<paddle::Tensor> ShareExternalData(const paddle::Tensor& input,
3333
int ret = xpu_ipc_open_memhandle(&data_ptr_addr,
3434
*(XPUIpcMemHandle*)&shm->memHandle,
3535
0x01); // NOLINT
36-
PD_CHECK(ret == XPU_SUCCESS, "%s xpu_ipc_open_memhandle failed", shm_name);
36+
PD_CHECK(ret == XPU_SUCCESS, shm_name, " xpu_ipc_open_memhandle failed");
3737
#elif XPURT_VERSION_MAJOR == 4
3838
PD_THROW("kl2 not support prefix cache");
3939
#endif

fastdeploy/cache_manager/cache_transfer_manager.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,7 @@ def _init_gpu_cache(self, args):
204204
logger.info(f"[rank {self.rank}/{self.n_ranks}] OK! Stop waiting.")
205205

206206
logger.info(f"[rank {self.rank}/{self.n_ranks}] Initializing kv cache for all layers.")
207-
set_device(self.device)
207+
set_device(self.rank)
208208
for i in range(args.num_layers + self.num_extra_layers):
209209
num_gpu_blocks = self.num_gpu_blocks if i < args.num_layers else self.num_extra_layer_gpu_blocks
210210
key_name = f"key_caches_{i}_rank{self.rank}.device{self.device}"
@@ -569,7 +569,7 @@ def clear_or_update_caches(self, args):
569569
time.sleep(0.1)
570570

571571
# clear gpu caches
572-
set_device(self.device)
572+
set_device(self.rank)
573573
for name, tensor in self.gpu_cache_kvs.items():
574574
unset_data_ipc(tensor, name, True, False)
575575
self.gpu_cache_kvs.clear()
@@ -640,5 +640,5 @@ def main():
640640
args = parse_args()
641641
rank_id = args.rank + args.local_data_parallel_id * args.mp_num
642642
logger = get_logger("cache_transfer_manager", f"cache_transfer_manager_rank{rank_id}.log")
643-
set_device(args.device_id)
643+
set_device(rank_id)
644644
main()

fastdeploy/model_executor/layers/attention/utils.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import os
1818

1919
from fastdeploy.config import FDConfig
20+
from fastdeploy.platforms import current_platform
2021

2122

2223
def init_rank_and_device_id(fd_config: FDConfig):
@@ -26,7 +27,10 @@ def init_rank_and_device_id(fd_config: FDConfig):
2627
+ fd_config.parallel_config.tensor_parallel_rank
2728
)
2829

29-
cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES", None)
30+
if current_platform.is_xpu():
31+
cuda_visible_devices = os.getenv("XPU_VISIBLE_DEVICES", None)
32+
else: # default cuda
33+
cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES", None)
3034

3135
if cuda_visible_devices is None:
3236
device_id = rank

fastdeploy/worker/xpu_model_runner.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1124,7 +1124,6 @@ def _dummy_run(
11241124
self._dummy_prefill_inputs(num_tokens, batch_size)
11251125

11261126
while True:
1127-
return None
11281127
self.execute_model(is_dummy_run=True)
11291128

11301129
if int((self.share_inputs["seq_lens_this_time"] > 0).sum()) == 0:

0 commit comments

Comments (0)