@@ -105,7 +105,7 @@ OVExeNetwork OVCore::StatefulCompileModel(std::shared_ptr<OVNetwork>& model,
105
105
106
106
if (hw_target.find (" NPU" ) != std::string::npos) {
107
107
KVDesc kv_desc;
108
- kv_desc.max_prompt_len = PopIntAndCast (config, " MAX_PROMPT_LEN" ).value_or (3072u );
108
+ kv_desc.max_prompt_len = PopIntAndCast (config, " MAX_PROMPT_LEN" ).value_or (1024u );
109
109
kv_desc.min_response_len = PopIntAndCast (config, " MIN_RESPONSE_LEN" ).value_or (128u );
110
110
111
111
if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled ()) {
@@ -488,5 +488,50 @@ void StatefulOVInferRequest::Infer() {
488
488
OVInferRequest::Infer ();
489
489
}
490
490
491
+ void StatefulOVInferRequest::RewindKVCache (size_t index) {
492
+ if (device == " NPU" ) {
493
+ std::cout << " RewindKVCache on NPU: Trimming cached input_ids / position_ids to length "
494
+ << index << std::endl;
495
+ if (cached_input_ids.size () > index) {
496
+ cached_input_ids.resize (index);
497
+ }
498
+
499
+ if (cached_position_ids.size () > index) {
500
+ cached_position_ids.resize (index);
501
+ }
502
+ } else {
503
+ std::cout << " OVInferRequest::RewindKVCache: Trimming internal states to length = "
504
+ << index << std::endl;
505
+ if (index == 0 ) {
506
+ // in this case, since we're trimming *all* of the KVCache, just reset the state.
507
+ ovInfReq.reset_state ();
508
+ } else {
509
+ // retrieve kvcache states, and trim...
510
+ // Most of this code was grabbed from here:
511
+ // https://github.com/openvinotoolkit/openvino.genai/blob/releases/2025/1/src/cpp/src/utils.cpp#L329
512
+ auto states = ovInfReq.query_state ();
513
+ for (auto & state : states) {
514
+ ov::Tensor old_tensor = state.get_state ();
515
+ // [BATCH_SIZE, num_kv_heads, seq_len, head_size]
516
+ auto shape = old_tensor.get_shape ();
517
+
518
+ if (shape[2 ] > index) {
519
+ shape[2 ] = index;
520
+
521
+ ov::Coordinate new_shape_begin{0 , 0 , 0 , 0 };
522
+ ov::Coordinate new_shape_end{shape};
523
+
524
+ auto trimmed_tensor = ov::Tensor (old_tensor, new_shape_begin, new_shape_end);
525
+
526
+ ov::Tensor new_tensor (old_tensor.get_element_type (), shape);
527
+ trimmed_tensor.copy_to (new_tensor);
528
+
529
+ state.set_state (new_tensor);
530
+ }
531
+ }
532
+ }
533
+ }
534
+ }
535
+
491
536
} // namespace openvino_ep
492
537
} // namespace onnxruntime
0 commit comments