pytorch · mergennachin · Jun 3, 2026 · Jun 3, 2026 · Jun 4, 2026 · Jun 4, 2026
@@ -15,6 +15,7 @@
 #include <executorch/extension/llm/runner/multimodal_runner.h>
 #include <executorch/extension/llm/runner/stats.h>
 #include <executorch/extension/llm/runner/text_llm_runner.h>
+#include <executorch/extension/llm/runner/text_llm_session.h>
 #include <executorch/extension/llm/runner/text_prefiller.h>
 #include <executorch/extension/llm/runner/text_token_generator.h>
 #include <executorch/extension/memory_allocator/cpu_caching_malloc_allocator.h>
@@ -29,8 +30,18 @@
 namespace executorch::extension::llm {
 
 using ::executorch::extension::Module;
+using ::executorch::extension::Program;
 using ::executorch::runtime::Error;
 
+// Assembles the per-Module components (decoder/prefiller/token generator/io
+// manager/stats) into a TextLLMRunner. Shared by the path-based and the
+// shared-Program (TextLLMEngine session) construction paths.
+static std::unique_ptr<TextLLMRunner> assemble_text_llm_runner(
+    std::unique_ptr<Module> module,
+    std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
+    float temperature,
+    const std::string& method_name);
+
 std::unique_ptr<tokenizers::Tokenizer> load_tokenizer(
     const std::string& tokenizer_path,
     std::unique_ptr<std::vector<std::string>> special_tokens,
@@ -251,6 +262,15 @@ std::unique_ptr<TextLLMRunner> create_text_llm_runner(
             max_cached_memory_size_bytes_));
   }
 
+  return assemble_text_llm_runner(
+      std::move(module), std::move(tokenizer), temperature, method_name);
+}
+
+static std::unique_ptr<TextLLMRunner> assemble_text_llm_runner(
+    std::unique_ptr<Module> module,
+    std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
+    float temperature,
+    const std::string& method_name) {
   // Get metadata from Module
   ET_LOG(Info, "Reading metadata from model");
   auto metadata_result = llm::get_llm_metadata(tokenizer.get(), module.get());
@@ -305,6 +325,189 @@ std::unique_ptr<TextLLMRunner> create_text_llm_runner(
       temperature);
 }
 
+std::unique_ptr<TextLLMRunner> create_text_llm_runner_from_program(
+    std::shared_ptr<Program> program,
+    std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
+    float temperature,
+    const std::string& method_name) {
+  if (!tokenizer || !tokenizer->is_loaded()) {
+    ET_LOG(Error, "Tokenizer is null or not loaded");
+    return nullptr;
+  }
+  if (!program) {
+    ET_LOG(Error, "Program is null");
+    return nullptr;
+  }
+  // A Module over the already-loaded Program: it reuses that Program rather
+  // than re-loading it, and its loaded method allocates its own planned (KV)
+  // memory. Whether packed weights are physically shared vs. re-materialized
+  // per method instance is backend-dependent (serving_capacity() is the
+  // authority).
+  constexpr uint32_t kMaxCachedMemoryBytes = 1024 * 1024 * 10; // 10MB
+  auto module = std::make_unique<Module>(
+      std::move(program),
+      nullptr, // memory allocator
+      std::make_unique<executorch::extension::CPUCachingAllocator>(
+          kMaxCachedMemoryBytes));
+  return assemble_text_llm_runner(
+      std::move(module), std::move(tokenizer), temperature, method_name);
+}
+
+namespace detail {
+// The TextLLM adapter: implements the model-agnostic LLMSession over a
+// TextLLMRunner. TextLLMRunner's token-step methods are private; this adapter
+// is their only (friended) caller, so the engine and server depend solely on
+// LLMSession.
+TextLLMSession::TextLLMSession(std::unique_ptr<TextLLMRunner> runner)
+    : runner_(std::move(runner)) {}
+
+Error TextLLMSession::prefill_tokens(
+    std::vector<uint64_t> tokens,
+    const SamplingConfig* initial_sampling) {
+  // The model samples the FIRST generated token during prefill, so apply the
+  // request's sampling here (not a stale default). Only temperature is
+  // plumbed; reject non-default top_p/top_k/seed for parity with decode_one().
+  float temperature = -1.0f;
+  if (initial_sampling != nullptr) {
+    if (initial_sampling->top_p != 1.0f || initial_sampling->top_k != 0 ||
+        initial_sampling->seed != 0) {
+      ET_LOG(
+          Error,
+          "TextLLMSession: only temperature is supported; top_p/top_k/seed "
+          "are not yet implemented");
+      return ::executorch::runtime::Error::NotSupported;
+    }
+    temperature = initial_sampling->temperature;
+  }
+  return runner_->prefill_tokens(std::move(tokens), temperature).error();
+}
+
+::executorch::runtime::Result<DecodeResult> TextLLMSession::decode_one(
+    const SamplingConfig& sampling) {
+  // Only temperature is plumbed today; top_p/top_k/seed need a per-session
+  // sampler (a follow-up). Reject non-default values rather than silently
+  // ignoring them, so callers can't assume constraints are applied.
+  if (sampling.top_p != 1.0f || sampling.top_k != 0 || sampling.seed != 0) {
+    ET_LOG(
+        Error,
+        "TextLLMSession: only temperature is supported; top_p/top_k/seed are "
+        "not yet implemented");
+    return ::executorch::runtime::Error::NotSupported;
+  }
+  return runner_->decode_one(sampling.temperature);
+}
+
+Error TextLLMSession::seek(int64_t pos) {
+  return runner_->seek(pos);
+}
+
+int64_t TextLLMSession::position() const {
+  return runner_->position();
+}
+
+Error TextLLMSession::reset() {
+  runner_->reset();
+  return Error::Ok;
+}
+
+void TextLLMSession::stop() {
+  runner_->stop();
+}
+} // namespace detail
+
+TextLLMEngine::TextLLMEngine(
+    std::unique_ptr<Module> loader_module,
+    std::shared_ptr<Program> program,
+    std::string tokenizer_path,
+    float temperature,
+    std::string method_name,
+    std::unordered_map<std::string, int64_t> metadata)
+    : loader_module_(std::move(loader_module)),
+      program_(std::move(program)),
+      tokenizer_path_(std::move(tokenizer_path)),
+      temperature_(temperature),
+      method_name_(std::move(method_name)),
+      metadata_(std::move(metadata)) {}
+
+std::unique_ptr<TextLLMEngine> TextLLMEngine::create(
+    const std::string& model_path,
+    const std::string& tokenizer_path,
+    std::optional<const std::string> data_path,
+    float temperature,
+    const std::string& method_name,
+    Module::LoadMode load_mode) {
+  // External .ptd weights are not yet supported for shared sessions: each
+  // session Module built from the shared Program would also need the
+  // data_map_loader threaded into its load_method() to resolve external
+  // weights (see Module::load_method merged_data_map_). Fail loudly rather than
+  // silently produce sessions that error on first generate.
+  if (data_path.has_value()) {
+    ET_LOG(
+        Error,
+        "TextLLMEngine: external data_path (.ptd) is not yet supported for "
+        "shared sessions; use a self-contained .pte for now.");
+    return nullptr;
+  }
+  // Load the program ONCE; sessions reuse it (loaded a single time, per-session
+  // KV). Physical weight sharing across sessions is backend-dependent — see
+  // serving_capacity().
+  auto loader_module = std::make_unique<Module>(model_path, load_mode);
+  if (loader_module->load() != Error::Ok) {
+    ET_LOG(
+        Error,
+        "TextLLMEngine: failed to load program from %s",
+        model_path.c_str());
+    return nullptr;
+  }
+  auto program = loader_module->program();
+  if (!program) {
+    ET_LOG(Error, "TextLLMEngine: program is null after load");
+    return nullptr;
+  }
+  // Read model-level metadata once (shared by all sessions).
+  auto meta_tokenizer = load_tokenizer(tokenizer_path);
+  if (!meta_tokenizer) {
+    ET_LOG(
+        Error,
+        "TextLLMEngine: failed to load tokenizer from %s",
+        tokenizer_path.c_str());
+    return nullptr;
+  }
+  auto metadata_result =
+      get_llm_metadata(meta_tokenizer.get(), loader_module.get());
+  if (metadata_result.error() != Error::Ok) {
+    ET_LOG(Error, "TextLLMEngine: failed to read metadata");
+    return nullptr;
+  }
+  return std::unique_ptr<TextLLMEngine>(new TextLLMEngine(
+      std::move(loader_module),
+      std::move(program),
+      tokenizer_path,
+      temperature,
+      method_name,
+      metadata_result.get()));
+}
+
+::executorch::runtime::Result<std::unique_ptr<LLMSession>>
+TextLLMEngine::create_session() {
+  auto tokenizer = load_tokenizer(tokenizer_path_);
+  if (!tokenizer) {
+    ET_LOG(
+        Error,
+        "TextLLMEngine: failed to load tokenizer from %s",
+        tokenizer_path_.c_str());
+    return Error::InvalidState;
+  }
+  auto runner = create_text_llm_runner_from_program(
+      program_, std::move(tokenizer), temperature_, method_name_);
+  if (!runner) {
+    ET_LOG(Error, "TextLLMEngine: failed to build session runner");
+    return Error::InvalidState;
+  }
+  return std::unique_ptr<LLMSession>(
+      std::make_unique<detail::TextLLMSession>(std::move(runner)));
+}
+
 std::unique_ptr<MultimodalRunner> create_multimodal_runner(
     const std::string& model_path,
     std::unique_ptr<::tokenizers::Tokenizer> tokenizer,

@@ -18,6 +18,7 @@
 #include <vector>
 
 #include <executorch/extension/llm/runner/constants.h>
+#include <executorch/extension/llm/runner/llm_session.h>
 #include <executorch/extension/module/module.h>
 #include <executorch/runtime/core/result.h>
 #include <executorch/runtime/platform/compiler.h>
@@ -141,6 +142,89 @@ ET_EXPERIMENTAL std::unique_ptr<TextLLMRunner> create_text_llm_runner(
     const std::string& method_name = "forward",
     Module::LoadMode load_mode = Module::LoadMode::MmapUseMlockIgnoreErrors);
 
+/**
+ * @brief Creates a TextLLMRunner over an already-loaded Program.
+ *
+ * Unlike create_text_llm_runner(model_path, ...), this does not load the model
+ * file again: the resulting runner's Module reuses `program` while owning its
+ * own method state and KV cache. This is the per-session construction path for
+ * TextLLMEngine — N sessions reuse one loaded Program but isolate their mutable
+ * KV state. Whether they also avoid re-materializing packed weights per session
+ * is backend-dependent (serving_capacity() is authoritative).
+ *
+ * The caller must keep the DataLoader backing `program` alive for the lifetime
+ * of every runner created from it (TextLLMEngine holds the loader Module).
+ *
+ * @param program Shared, already-loaded program.
+ * @param tokenizer Initialized tokenizer instance (owned by the new runner).
+ * @param temperature Optional temperature (deprecated; prefer
+ * GenerationConfig).
+ * @param method_name Name of the method to execute in the model.
+ * @return std::unique_ptr<TextLLMRunner> on success, or nullptr on failure.
+ */
+ET_EXPERIMENTAL std::unique_ptr<TextLLMRunner>
+create_text_llm_runner_from_program(
+    std::shared_ptr<Program> program,
+    std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
+    float temperature = -1.0f,
+    const std::string& method_name = "forward");
+
+/**
+ * @brief Engine for multi-session text generation over one loaded Program.
+ *
+ * Loads the model's Program (weights/constants) once; create_session() builds a
+ * TextLLMRunner that reuses that Program but owns its own method/KV state. This
+ * is the correctness-first foundation for serving multiple conversations.
+ * Backend execution should be serialized by the caller until per-backend thread
+ * safety is proven (Module::execute is not assumed thread-safe). Whether extra
+ * sessions avoid duplicating packed weights is backend-dependent and reported
+ * by serving_capacity() (conservatively one).
+ */
+class ET_EXPERIMENTAL TextLLMEngine : public LLMEngine {
+ public:
+  static std::unique_ptr<TextLLMEngine> create(
+      const std::string& model_path,
+      const std::string& tokenizer_path,
+      std::optional<const std::string> data_path = std::nullopt,
+      float temperature = -1.0f,
+      const std::string& method_name = "forward",
+      Module::LoadMode load_mode = Module::LoadMode::MmapUseMlockIgnoreErrors);
+
+  // Returns a TextLLMSession (LLMSession) that reuses this engine's loaded
+  // Program (physical weight sharing is backend-dependent; see
+  // serving_capacity).
+  ::executorch::runtime::Result<std::unique_ptr<LLMSession>> create_session()
+      override;
+  // Conservative: a single physical session (no proven cross-session weight
+  // sharing). Raise on a backend proven to share packed weights.
+  LLMServingCapacity serving_capacity() const override {
+    return LLMServingCapacity{};
+  }
+  const std::unordered_map<std::string, int64_t>& metadata() const override {
+    return metadata_;
+  }
+
+  TextLLMEngine(const TextLLMEngine&) = delete;
+  TextLLMEngine& operator=(const TextLLMEngine&) = delete;
+
+ private:
+  TextLLMEngine(
+      std::unique_ptr<Module> loader_module,
+      std::shared_ptr<Program> program,
+      std::string tokenizer_path,
+      float temperature,
+      std::string method_name,
+      std::unordered_map<std::string, int64_t> metadata);
+
+  // Keeps the shared Program's DataLoader alive for the lifetime of sessions.
+  std::unique_ptr<Module> loader_module_;
+  std::shared_ptr<Program> program_;
+  std::string tokenizer_path_;
+  float temperature_;
+  std::string method_name_;
+  std::unordered_map<std::string, int64_t> metadata_;
+};
+
 /**
  * @brief Creates a MultimodalRunner instance with dependency injection
  *