@@ -125,7 +125,7 @@ OVExeNetwork OVCore::StatefulCompileModel(std::shared_ptr<OVNetwork>& model,
   compiled_model = OVCore::Get()->core.compile_model(model, hw_target, config);
   std::cout << "Stateful OV Model Compilation Complete" << std::endl;
 
-  OVExeNetwork exe(compiled_model);
+  OVExeNetwork exe(compiled_model, hw_target, true);
   return exe;
 }
 
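For context, the two new constructor arguments indicate that OVExeNetwork now records the target device and whether the model was compiled as a stateful LLM. A minimal sketch of the assumed shape of that wrapper follows; the member names (`obj`, `_device`, `_stateful_llm`) are inferred from the CreateInferRequest hunk further down, and the real declaration lives in the accompanying header, which is not part of this diff.

    // Sketch only; inferred from this diff, not copied from the actual header.
    class OVExeNetwork {
      ov::CompiledModel obj;
      std::string _device;
      bool _stateful_llm = false;

     public:
      OVExeNetwork() = default;
      OVExeNetwork(ov::CompiledModel compiled, std::string device, bool stateful_llm = false)
          : obj(std::move(compiled)), _device(std::move(device)), _stateful_llm(stateful_llm) {}
      ov::CompiledModel& Get() { return obj; }
      std::shared_ptr<OVInferRequest> CreateInferRequest();
    };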
@@ -134,19 +134,18 @@ OVExeNetwork OVCore::CompileModel(std::shared_ptr<const OVNetwork>& ie_cnn_netwo
                                   ov::AnyMap& device_config,
                                   bool enable_causallm,
                                   const std::string& name) {
-  ov::CompiledModel obj;
+  OVExeNetwork exe;
   try {
     if (enable_causallm) {
       auto mutable_model = ie_cnn_network->clone();
-      auto compiled_model = OVCore::Get()->StatefulCompileModel(mutable_model, hw_target, device_config);
-      obj = compiled_model.Get();
+      exe = OVCore::Get()->StatefulCompileModel(mutable_model, hw_target, device_config);
     } else {
-      obj = core.compile_model(ie_cnn_network, hw_target, device_config);
+      auto obj = core.compile_model(ie_cnn_network, hw_target, device_config);
+      exe = OVExeNetwork(obj, hw_target);
     }
 #ifndef NDEBUG
     printDebugInfo(obj);
 #endif
-    OVExeNetwork exe(obj);
     return exe;
   } catch (const Exception& e) {
     ORT_THROW(log_tag + " Exception while Loading Network for graph: " + name + e.what());
@@ -165,7 +164,7 @@ OVExeNetwork OVCore::CompileModel(const std::string& onnx_model,
 #ifndef NDEBUG
     printDebugInfo(obj);
 #endif
-    OVExeNetwork exe(obj);
+    OVExeNetwork exe(obj, hw_target);
     return exe;
   } catch (const Exception& e) {
     ORT_THROW(log_tag + " Exception while Loading Network for graph: " + name + e.what());
@@ -180,7 +179,7 @@ OVExeNetwork OVCore::ImportModel(std::istream& model_stream,
                                  bool enable_causallm,
                                  std::string name) {
   try {
-    ov::CompiledModel obj;
+    OVExeNetwork exe;
 
     // Check if it's XML
     std::streampos originalPos = model_stream.tellg();
@@ -194,7 +193,8 @@ OVExeNetwork OVCore::ImportModel(std::istream& model_stream,
     model_stream.seekg(originalPos);
 
     if (header != "<?xml") {
-      obj = core.import_model(model_stream, hw_target, device_config);
+      auto obj = core.import_model(model_stream, hw_target, device_config);
+      exe = OVExeNetwork(obj, hw_target);
     } else {
       // Get path to bin file
       std::string bin_file;
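The `header != "<?xml"` comparison distinguishes a precompiled blob from OpenVINO IR XML. The sniffing step itself happens just above this hunk and is not shown here; it presumably amounts to something like the following sketch (not the PR's exact code; variable names match the hunk's context lines).

    // Hypothetical reconstruction of the header sniff: peek the first bytes, then rewind.
    std::streampos originalPos = model_stream.tellg();
    std::string header(5, '\0');
    model_stream.read(&header[0], static_cast<std::streamsize>(header.size()));
    model_stream.seekg(originalPos);  // rewind so import_model/read_model sees the full stream
    const bool looks_like_xml = (header == "<?xml");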
@@ -232,17 +232,16 @@ OVExeNetwork OVCore::ImportModel(std::istream& model_stream,
       std::shared_ptr<ov::Model> model = core.read_model(xml_content, weights_tensor);
 
       if (enable_causallm) {
-        auto compiled_model = OVCore::Get()->StatefulCompileModel(model, hw_target, device_config);
-        obj = compiled_model.Get();
+        exe = OVCore::Get()->StatefulCompileModel(model, hw_target, device_config);
       } else {
-        obj = core.compile_model(model, hw_target, device_config);
+        auto obj = core.compile_model(model, hw_target, device_config);
+        exe = OVExeNetwork(obj, hw_target);
       }
     }
 
 #ifndef NDEBUG
     printDebugInfo(obj);
 #endif
-    OVExeNetwork exe(obj);
     return exe;
   } catch (const Exception& e) {
     ORT_THROW(log_tag + " Exception while Loading Network for graph: " + name + e.what());
@@ -330,11 +329,16 @@ void OVCore::SetStreams(const std::string& device_type, int num_streams) {
   core.set_property(device_type, {ov::num_streams(num_streams)});
 }
 
-OVInferRequest OVExeNetwork::CreateInferRequest() {
+std::shared_ptr<OVInferRequest> OVExeNetwork::CreateInferRequest() {
   try {
     auto infReq = obj.create_infer_request();
-    OVInferRequest inf_obj(std::move(infReq));
-    return inf_obj;
+    std::shared_ptr<OVInferRequest> ovInfReq;
+    if (_stateful_llm) {
+      ovInfReq = std::make_shared<StatefulOVInferRequest>(std::move(infReq), _device);
+    } else {
+      ovInfReq = std::make_shared<OVInferRequest>(std::move(infReq));
+    }
+    return ovInfReq;
   } catch (const Exception& e) {
     ORT_THROW(log_tag + " Exception while creating InferRequest object: " + e.what());
   } catch (...) {
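The factory now returns a `std::shared_ptr<OVInferRequest>` so the stateful subclass can be substituted transparently. A minimal caller-side sketch, assuming `Infer()`/`StartAsync()` are declared virtual in the base class; the caller variables below are illustrative and not part of this diff.

    // Hypothetical caller (e.g. the EP's backend code).
    OVExeNetwork exe = OVCore::Get()->CompileModel(model, hw_target, device_config,
                                                   /*enable_causallm=*/true, "llm_graph");
    std::shared_ptr<OVInferRequest> req = exe.CreateInferRequest();
    req->SetTensor("input_ids", input_ids_blob);
    req->Infer();  // dispatches to StatefulOVInferRequest::Infer(), which injects beam_idx
                   // and (on NPU) replays cached input_ids / position_ids before inference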
@@ -368,16 +372,6 @@ std::string OVInferRequest::GetInputTensorName(uint32_t index) {
 void OVInferRequest::SetTensor(const std::string& name, OVTensorPtr& blob) {
   try {
     ovInfReq.set_tensor(name, *(blob.get()));
-
-    if (name == "input_ids") {
-      // Since we can't seem to set at ORT GenAI layer right now, we just set it here
-      // as a workaround.
-      // TODO: Fix this.
-      ov::Tensor beam_idx = ov::Tensor(ov::element::i32, {1});
-      std::fill_n(beam_idx.data<int32_t>(), 1, 0);
-      ovInfReq.set_tensor("beam_idx", beam_idx);
-    }
-
   } catch (const Exception& e) {
     ORT_THROW(log_tag + " Cannot set Remote Blob for output: " + name + e.what());
   } catch (...) {
@@ -423,5 +417,121 @@ void OVInferRequest::QueryStatus() {
   std::cout << "ovInfReq.query_state()"
             << " ";
 }
+
+void StatefulOVInferRequest::_pre_infer() {
+  // Since we can't seem to set at ORT GenAI layer right now, we just set it here
+  // as a workaround.
+  // TODO: Fix this.
+  ov::Tensor beam_idx = ov::Tensor(ov::element::i32, {1});
+  std::fill_n(beam_idx.data<int32_t>(), 1, 0);
+  ovInfReq.set_tensor("beam_idx", beam_idx);
+
+  // For NPU, we need to cache input_ids and position_ids for
+  // chat-mode support.
+  if (device.find("NPU") != std::string::npos) {
+    auto input_ids_tensor = ovInfReq.get_tensor("input_ids");
+
+    // add input_ids to our cache
+    {
+      auto* pData = input_ids_tensor.data<int64_t>();
+      for (size_t i = 0; i < input_ids_tensor.get_size(); i++) {
+        cached_input_ids.push_back(pData[i]);
+      }
+    }
+
+    // add position_ids to our cache
+    {
+      auto position_ids = ovInfReq.get_tensor("position_ids");
+      auto* pData = position_ids.data<int64_t>();
+      for (size_t i = 0; i < position_ids.get_size(); i++) {
+        cached_position_ids.push_back(pData[i]);
+      }
+    }
+
+    // if we're about to run prefill model
+    if (input_ids_tensor.get_size() > 1) {
+      // if the input_ids size doesn't equal cached size of the input_ids
+      // then it means that we're running 2nd (or later) prompt.
+      if (input_ids_tensor.get_shape()[1] != cached_input_ids.size()) {
+        // set a new input_ids tensor with the content of our cached input_ids
+        {
+          auto new_shape = input_ids_tensor.get_shape();
+          new_shape[1] = cached_input_ids.size();
+          auto new_input_ids = ov::Tensor(input_ids_tensor.get_element_type(), new_shape);
+          auto* pNewInputIds = new_input_ids.data<int64_t>();
+          std::memcpy(pNewInputIds, cached_input_ids.data(), cached_input_ids.size() * sizeof(int64_t));
+          ovInfReq.set_tensor("input_ids", new_input_ids);
+        }
+
+        // set a new position_ids tensor with the content of our cached position_ids
+        {
+          auto position_ids_tensor = ovInfReq.get_tensor("position_ids");
+          auto new_shape = position_ids_tensor.get_shape();
+          new_shape[1] = cached_position_ids.size();
+          auto new_position_ids = ov::Tensor(position_ids_tensor.get_element_type(), new_shape);
+          auto* pNewPositionIds = new_position_ids.data<int64_t>();
+          std::memcpy(pNewPositionIds, cached_position_ids.data(), cached_position_ids.size() * sizeof(int64_t));
+          ovInfReq.set_tensor("position_ids", new_position_ids);
+        }
+      }
+    }
+  }
+}
+
+void StatefulOVInferRequest::StartAsync() {
+  _pre_infer();
+  OVInferRequest::StartAsync();
+}
+
+void StatefulOVInferRequest::Infer() {
+  _pre_infer();
+  OVInferRequest::Infer();
+}
+
+void StatefulOVInferRequest::RewindKVCache(size_t index) {
+  if (device == "NPU") {
+    std::cout << "RewindKVCache on NPU: Trimming cached input_ids / position_ids to length "
+              << index << std::endl;
+    if (cached_input_ids.size() > index) {
+      cached_input_ids.resize(index);
+    }
+
+    if (cached_position_ids.size() > index) {
+      cached_position_ids.resize(index);
+    }
+  } else {
+    std::cout << "OVInferRequest::RewindKVCache: Trimming internal states to length = "
+              << index << std::endl;
+    if (index == 0) {
+      // in this case, since we're trimming *all* of the KVCache, just reset the state.
+      ovInfReq.reset_state();
+    } else {
+      // retrieve kvcache states, and trim...
+      // Most of this code was grabbed from here:
+      // https://github.com/openvinotoolkit/openvino.genai/blob/releases/2025/1/src/cpp/src/utils.cpp#L329
+      auto states = ovInfReq.query_state();
+      for (auto& state : states) {
+        ov::Tensor old_tensor = state.get_state();
+        // [BATCH_SIZE, num_kv_heads, seq_len, head_size]
+        auto shape = old_tensor.get_shape();
+
+        if (shape[2] > index) {
+          shape[2] = index;
+
+          ov::Coordinate new_shape_begin{0, 0, 0, 0};
+          ov::Coordinate new_shape_end{shape};
+
+          auto trimmed_tensor = ov::Tensor(old_tensor, new_shape_begin, new_shape_end);
+
+          ov::Tensor new_tensor(old_tensor.get_element_type(), shape);
+          trimmed_tensor.copy_to(new_tensor);
+
+          state.set_state(new_tensor);
+        }
+      }
+    }
+  }
+}
+
 } // namespace openvino_ep
 } // namespace onnxruntime
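To make the non-NPU rewind path easier to follow in isolation: each variable state's KV tensor is shrunk along the sequence axis by taking a zero-copy ROI over the kept prefix and then copying it into a fresh tensor before calling set_state, mirroring the openvino.genai helper linked in the code. A self-contained sketch of the same technique, assuming the `[batch, num_kv_heads, seq_len, head_size]` layout noted in the diff (the function name is illustrative):

    #include <openvino/openvino.hpp>

    // Trim one KV-cache variable state so that only the first `keep_len` positions survive.
    void TrimState(ov::VariableState& state, size_t keep_len) {
      ov::Tensor old_tensor = state.get_state();
      auto shape = old_tensor.get_shape();     // [batch, num_kv_heads, seq_len, head_size]
      if (shape[2] <= keep_len) return;        // nothing to trim
      shape[2] = keep_len;
      ov::Coordinate begin{0, 0, 0, 0};
      ov::Coordinate end{shape};
      ov::Tensor roi(old_tensor, begin, end);  // view over the kept prefix, no copy yet
      ov::Tensor contiguous(old_tensor.get_element_type(), shape);
      roi.copy_to(contiguous);                 // densely pack the kept slice
      state.set_state(contiguous);
    }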