@@ -551,7 +551,7 @@ static void ggml_cpy_f16_f16_cuda(
551
551
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
552
552
}
553
553
554
- void ggml_cuda_cpy (ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) {
554
+ void ggml_cuda_cpy (ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1, bool disable_indirection_for_this_node ) {
555
555
const int64_t ne = ggml_nelements (src0);
556
556
GGML_ASSERT (ne == ggml_nelements (src1));
557
557
@@ -588,7 +588,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
588
588
char ** dest_ptrs_d = nullptr ;
589
589
int graph_cpynode_index = -1 ;
590
590
#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
591
- if (ctx.cuda_graph ->use_cpy_indirection ) {
591
+ if (ctx.cuda_graph ->use_cpy_indirection && !disable_indirection_for_this_node ) {
592
592
dest_ptrs_d = ctx.cuda_graph ->dest_ptrs_d ;
593
593
graph_cpynode_index = ctx.cuda_graph ->graph_cpynode_index ;
594
594
}
@@ -636,7 +636,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
636
636
ggml_type_name (src0->type ), ggml_type_name (src1->type ));
637
637
}
638
638
#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
639
- if (ctx.cuda_graph ->use_cpy_indirection ) {
639
+ if (ctx.cuda_graph ->use_cpy_indirection && !disable_indirection_for_this_node ) {
640
640
ctx.cuda_graph ->graph_cpynode_index = graph_cpynode_index;
641
641
}
642
642
#endif
@@ -645,7 +645,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
645
645
646
646
// Duplicates dst->src[0] into dst by delegating to ggml_cuda_cpy.
// After this change the call always passes disable_indirection_for_this_node = true.
// In the visible parts of ggml_cuda_cpy that flag gates both
// `ctx.cuda_graph->use_cpy_indirection` branches: with it set, the copy neither picks up
// dest_ptrs_d / graph_cpynode_index from ctx.cuda_graph nor writes graph_cpynode_index back.
// NOTE(review): the rest of ggml_cuda_cpy is not visible here — confirm no other code path
// depends on the indirection state for DUP nodes.
void ggml_cuda_dup (ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
647
647
const ggml_tensor * src0 = dst->src [0 ];
648
- ggml_cuda_cpy (ctx, src0, dst);
648
+ bool disable_indirection = true ;
649
+ ggml_cuda_cpy (ctx, src0, dst, disable_indirection);
649
650
}
650
651
651
652
void * ggml_cuda_cpy_fn (const ggml_tensor * src0, ggml_tensor * src1) {
0 commit comments