Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
177 changes: 177 additions & 0 deletions extension/llm/runner/llm_runner_helper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,18 @@
namespace executorch::extension::llm {

using ::executorch::extension::Module;
using ::executorch::extension::Program;
using ::executorch::runtime::Error;

// Forward declaration of the file-local helper that assembles the per-Module
// components (decoder/prefiller/token generator/io manager/stats) into a
// TextLLMRunner. Shared by the path-based and the shared-Program
// (TextLLMEngine session) construction paths; the definition appears further
// down in this file, after the path-based create_text_llm_runner() that calls
// it.
//
// Both `module` and `tokenizer` are taken by value as unique_ptr, so ownership
// of each transfers to this helper. `temperature` and `method_name` are
// forwarded unchanged by both callers.
// NOTE(review): failure behaviour (e.g. nullptr on metadata errors) is defined
// by the body, which is not fully visible at this point — confirm there.
static std::unique_ptr<TextLLMRunner> assemble_text_llm_runner(
std::unique_ptr<Module> module,
std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
float temperature,
const std::string& method_name);

std::unique_ptr<tokenizers::Tokenizer> load_tokenizer(
const std::string& tokenizer_path,
std::unique_ptr<std::vector<std::string>> special_tokens,
Expand Down Expand Up @@ -251,6 +261,15 @@ std::unique_ptr<TextLLMRunner> create_text_llm_runner(
max_cached_memory_size_bytes_));
}

return assemble_text_llm_runner(
std::move(module), std::move(tokenizer), temperature, method_name);
}

static std::unique_ptr<TextLLMRunner> assemble_text_llm_runner(
std::unique_ptr<Module> module,
std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
float temperature,
const std::string& method_name) {
// Get metadata from Module
ET_LOG(Info, "Reading metadata from model");
auto metadata_result = llm::get_llm_metadata(tokenizer.get(), module.get());
Expand Down Expand Up @@ -305,6 +324,164 @@ std::unique_ptr<TextLLMRunner> create_text_llm_runner(
temperature);
}

std::unique_ptr<TextLLMRunner> create_text_llm_runner_from_program(
std::shared_ptr<Program> program,
std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
float temperature,
const std::string& method_name) {
if (!tokenizer || !tokenizer->is_loaded()) {
ET_LOG(Error, "Tokenizer is null or not loaded");
return nullptr;
}
if (!program) {
ET_LOG(Error, "Program is null");
return nullptr;
}
// A Module over the already-loaded Program: it reuses that Program rather
// than re-loading it, and its loaded method allocates its own planned (KV)
// memory. Whether packed weights are physically shared vs. re-materialized
// per method instance is backend-dependent (serving_capacity() is the
// authority); on XNNPACK assume per-instance.
constexpr uint32_t kMaxCachedMemoryBytes = 1024 * 1024 * 10; // 10MB
auto module = std::make_unique<Module>(
std::move(program),
nullptr, // memory allocator
std::make_unique<executorch::extension::CPUCachingAllocator>(
kMaxCachedMemoryBytes));
return assemble_text_llm_runner(
std::move(module), std::move(tokenizer), temperature, method_name);
}

namespace {
// The TextLLM adapter: implements the model-agnostic LLMSession over a
// TextLLMRunner. TextLLMRunner is an implementation detail here — the engine
// and server depend only on LLMSession.
class TextLLMSession : public LLMSession {
public:
explicit TextLLMSession(std::unique_ptr<TextLLMRunner> runner)
: runner_(std::move(runner)) {}

Error prefill_tokens(std::vector<uint64_t> tokens) override {
return runner_->prefill_tokens(std::move(tokens)).error();
}
::executorch::runtime::Result<DecodeResult> decode_one(
const SamplingConfig& sampling) override {
// Only temperature is plumbed today; top_p/top_k/seed need a per-session
// sampler (applied in a follow-up).
return runner_->decode_one(sampling.temperature);
}
Comment thread
mergennachin marked this conversation as resolved.
Outdated
Error seek(int64_t pos) override {
return runner_->seek(pos);
}
int64_t position() const override {
return runner_->position();
}
Error reset() override {
runner_->reset();
return Error::Ok;
}
void stop() override {
runner_->stop();
}

private:
std::unique_ptr<TextLLMRunner> runner_;
};
} // namespace

TextLLMEngine::TextLLMEngine(
std::unique_ptr<Module> loader_module,
std::shared_ptr<Program> program,
std::string tokenizer_path,
float temperature,
std::string method_name,
std::unordered_map<std::string, int64_t> metadata)
: loader_module_(std::move(loader_module)),
program_(std::move(program)),
tokenizer_path_(std::move(tokenizer_path)),
temperature_(temperature),
method_name_(std::move(method_name)),
metadata_(std::move(metadata)) {}

// Factory for TextLLMEngine: loads the program at `model_path` once, reads the
// model-level metadata once, and returns an engine whose sessions reuse that
// loaded Program.
//
// @param model_path     Path to the .pte program to load.
// @param tokenizer_path Path handed to load_tokenizer(); used here to read
//                       metadata and stored so each session can load its own
//                       tokenizer instance.
// @param data_path      Must be empty; external .ptd data is rejected.
// @param temperature    Stored and forwarded to every session runner.
// @param method_name    Stored and forwarded to every session runner.
// @param load_mode      Load mode passed to the loader Module.
// @return The engine, or nullptr (after logging) when `data_path` is set, the
//         program fails to load or is null, the tokenizer fails to load, or
//         the metadata cannot be read.
std::unique_ptr<TextLLMEngine> TextLLMEngine::create(
    const std::string& model_path,
    const std::string& tokenizer_path,
    std::optional<const std::string> data_path,
    float temperature,
    const std::string& method_name,
    Module::LoadMode load_mode) {
  // External .ptd weights are not yet supported for shared sessions: each
  // session Module built from the shared Program would also need the
  // data_map_loader threaded into its load_method() to resolve external
  // weights (see Module::load_method merged_data_map_). Fail loudly rather than
  // silently produce sessions that error on first generate.
  if (data_path.has_value()) {
    ET_LOG(
        Error,
        "TextLLMEngine: external data_path (.ptd) is not yet supported for "
        "shared sessions; use a self-contained .pte for now.");
    return nullptr;
  }

  // Load the program ONCE; sessions reuse it (loaded a single time, per-session
  // KV). Physical weight sharing across sessions is backend-dependent — see
  // serving_capacity().
  auto loader_module = std::make_unique<Module>(model_path, load_mode);
  if (loader_module->load() != Error::Ok) {
    ET_LOG(
        Error,
        "TextLLMEngine: failed to load program from %s",
        model_path.c_str());
    return nullptr;
  }
  auto program = loader_module->program();
  if (!program) {
    ET_LOG(Error, "TextLLMEngine: program is null after load");
    return nullptr;
  }

  // Read model-level metadata once (shared by all sessions). This tokenizer is
  // only needed for the metadata query and is dropped when this function
  // returns; sessions load their own instance.
  auto meta_tokenizer = load_tokenizer(tokenizer_path);
  if (!meta_tokenizer) {
    ET_LOG(
        Error,
        "TextLLMEngine: failed to load tokenizer from %s",
        tokenizer_path.c_str());
    return nullptr;
  }
  auto metadata_result =
      get_llm_metadata(meta_tokenizer.get(), loader_module.get());
  if (metadata_result.error() != Error::Ok) {
    ET_LOG(Error, "TextLLMEngine: failed to read metadata");
    return nullptr;
  }

  // The constructor is private, so std::make_unique cannot be used here; the
  // raw `new` is wrapped in a unique_ptr immediately. The metadata map is
  // moved out of the local Result (which is about to be destroyed) instead of
  // being deep-copied.
  return std::unique_ptr<TextLLMEngine>(new TextLLMEngine(
      std::move(loader_module),
      std::move(program),
      tokenizer_path,
      temperature,
      method_name,
      std::move(metadata_result.get())));
}

// Creates a new session over the engine's shared Program.
//
// Each call loads a fresh tokenizer from the stored tokenizer path, builds a
// runner on the shared Program with the stored temperature and method name,
// and wraps that runner in a TextLLMSession. Returns Error::InvalidState
// (after logging) if either the tokenizer or the runner cannot be created.
::executorch::runtime::Result<std::unique_ptr<LLMSession>>
TextLLMEngine::create_session() {
  auto session_tokenizer = load_tokenizer(tokenizer_path_);
  if (session_tokenizer == nullptr) {
    ET_LOG(
        Error,
        "TextLLMEngine: failed to load tokenizer from %s",
        tokenizer_path_.c_str());
    return Error::InvalidState;
  }

  auto session_runner = create_text_llm_runner_from_program(
      program_, std::move(session_tokenizer), temperature_, method_name_);
  if (session_runner == nullptr) {
    ET_LOG(Error, "TextLLMEngine: failed to build session runner");
    return Error::InvalidState;
  }

  // Hand the runner to the adapter, then upcast to the interface type that
  // callers see.
  auto session = std::make_unique<TextLLMSession>(std::move(session_runner));
  return std::unique_ptr<LLMSession>(std::move(session));
}

std::unique_ptr<MultimodalRunner> create_multimodal_runner(
const std::string& model_path,
std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
Expand Down
88 changes: 88 additions & 0 deletions extension/llm/runner/llm_runner_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <vector>

#include <executorch/extension/llm/runner/constants.h>
#include <executorch/extension/llm/runner/llm_session.h>
#include <executorch/extension/module/module.h>
#include <executorch/runtime/core/result.h>
#include <executorch/runtime/platform/compiler.h>
Expand Down Expand Up @@ -141,6 +142,93 @@ ET_EXPERIMENTAL std::unique_ptr<TextLLMRunner> create_text_llm_runner(
const std::string& method_name = "forward",
Module::LoadMode load_mode = Module::LoadMode::MmapUseMlockIgnoreErrors);

/**
 * @brief Creates a TextLLMRunner over an already-loaded Program.
 *
 * Unlike create_text_llm_runner(model_path, ...), this does not load the model
 * file again: the resulting runner's Module reuses `program` while owning its
 * own method state and KV cache. This is the per-session construction path for
 * TextLLMEngine — N sessions reuse one loaded Program but isolate their mutable
 * KV state. Whether they also avoid re-materializing packed weights per session
 * is backend-dependent (serving_capacity() is authoritative; XNNPACK repacks
 * per method instance, so assume per-session weights there).
 *
 * The runner's Module is created without a custom memory allocator and with a
 * CPU caching temp allocator capped at 10MB.
 *
 * The caller must keep the DataLoader backing `program` alive for the lifetime
 * of every runner created from it (TextLLMEngine holds the loader Module).
 *
 * @param program Shared, already-loaded program. Must not be null.
 * @param tokenizer Initialized tokenizer instance (owned by the new runner).
 * Must be non-null and report is_loaded().
 * @param temperature Optional temperature (deprecated; prefer
 * GenerationConfig). Defaults to -1.0f.
 * @param method_name Name of the method to execute in the model. Defaults to
 * "forward".
 * @return std::unique_ptr<TextLLMRunner> on success, or nullptr on failure
 * (null/unloaded tokenizer, null program, or runner assembly failure).
 */
ET_EXPERIMENTAL std::unique_ptr<TextLLMRunner>
create_text_llm_runner_from_program(
std::shared_ptr<Program> program,
std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
float temperature = -1.0f,
const std::string& method_name = "forward");

/**
 * @brief Engine for multi-session text generation over one loaded Program.
 *
 * Loads the model's Program (weights/constants) once; create_session() builds a
 * TextLLMRunner that reuses that Program but owns its own method/KV state. This
 * is the correctness-first foundation for serving multiple conversations.
 * Backend execution should be serialized by the caller until per-backend thread
 * safety is proven (Module::execute is not assumed thread-safe). Whether extra
 * sessions actually avoid duplicating packed weights is a backend property
 * (e.g. AOTI/CUDA share device weights) reported by serving_capacity(); on the
 * XNNPACK path weights are repacked per method instance and the KV cache is
 * baked into the .pte, so it conservatively reports a single physical session.
 *
 * Instances are created only through create(); the constructor is private and
 * the class is non-copyable.
 */
class ET_EXPERIMENTAL TextLLMEngine : public LLMEngine {
public:
// Factory. Loads the program at `model_path` once and reads the model
// metadata once. Returns nullptr (after logging) if `data_path` is provided
// (external .ptd data is not yet supported for shared sessions), if the
// program or the tokenizer fails to load, or if metadata cannot be read.
// `tokenizer_path`, `temperature` and `method_name` are stored and reused for
// every session created later.
static std::unique_ptr<TextLLMEngine> create(
const std::string& model_path,
const std::string& tokenizer_path,
std::optional<const std::string> data_path = std::nullopt,
float temperature = -1.0f,
const std::string& method_name = "forward",
Module::LoadMode load_mode = Module::LoadMode::MmapUseMlockIgnoreErrors);

// Returns a TextLLMSession (LLMSession) that reuses this engine's loaded
// Program (physical weight sharing is backend-dependent; see
// serving_capacity). Each call loads a fresh tokenizer from the stored
// tokenizer path; Error::InvalidState is returned if the tokenizer or the
// session runner cannot be created.
::executorch::runtime::Result<std::unique_ptr<LLMSession>> create_session()
override;
// Conservative v1: a self-contained .pte repacks XNNPACK weights per runtime,
// so we don't claim multiple physical sessions share weights. Raise this on a
// backend/artifact proven to share packed weights.
// Currently returns a default-constructed LLMServingCapacity.
LLMServingCapacity serving_capacity() const override {
return LLMServingCapacity{};
}
// Model-level metadata read once in create() and shared by all sessions. The
// returned reference is to a member, so it is valid only while the engine is
// alive.
const std::unordered_map<std::string, int64_t>& metadata() const override {
return metadata_;
}

// Non-copyable: the engine uniquely owns the loader Module.
TextLLMEngine(const TextLLMEngine&) = delete;
TextLLMEngine& operator=(const TextLLMEngine&) = delete;

private:
// Only create() constructs engines; arguments are moved into the members
// below without further validation.
TextLLMEngine(
std::unique_ptr<Module> loader_module,
std::shared_ptr<Program> program,
std::string tokenizer_path,
float temperature,
std::string method_name,
std::unordered_map<std::string, int64_t> metadata);

// Keeps the shared Program's DataLoader alive for the lifetime of sessions.
// NOTE(review): members are destroyed in reverse declaration order, so
// program_ is released before loader_module_ is destroyed; keep
// loader_module_ declared first unless that ordering is intentionally changed.
std::unique_ptr<Module> loader_module_;
// The Program shared by every session created from this engine.
std::shared_ptr<Program> program_;
// Path used to load a fresh tokenizer for each new session.
std::string tokenizer_path_;
// Temperature forwarded to each session's runner.
float temperature_;
// Method name forwarded to each session's runner.
std::string method_name_;
// Model-level metadata captured once in create().
std::unordered_map<std::string, int64_t> metadata_;
};

/**
* @brief Creates a MultimodalRunner instance with dependency injection
*
Expand Down
Loading
Loading