Skip to content
Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
203 changes: 203 additions & 0 deletions extension/llm/runner/llm_runner_helper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <executorch/extension/llm/runner/multimodal_runner.h>
#include <executorch/extension/llm/runner/stats.h>
#include <executorch/extension/llm/runner/text_llm_runner.h>
#include <executorch/extension/llm/runner/text_llm_session.h>
#include <executorch/extension/llm/runner/text_prefiller.h>
#include <executorch/extension/llm/runner/text_token_generator.h>
#include <executorch/extension/memory_allocator/cpu_caching_malloc_allocator.h>
Expand All @@ -29,8 +30,18 @@
namespace executorch::extension::llm {

using ::executorch::extension::Module;
using ::executorch::extension::Program;
using ::executorch::runtime::Error;

// Assembles the per-Module components (decoder/prefiller/token generator/io
// manager/stats) into a TextLLMRunner. Shared by the path-based and the
// shared-Program (TextLLMEngine session) construction paths.
//
// Forward-declared here because create_text_llm_runner() calls it before its
// definition later in this file. Both `module` and `tokenizer` are passed by
// std::unique_ptr, so ownership transfers to this function; the definition
// reads the model metadata from them via get_llm_metadata().
static std::unique_ptr<TextLLMRunner> assemble_text_llm_runner(
std::unique_ptr<Module> module,
std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
float temperature,
const std::string& method_name);

std::unique_ptr<tokenizers::Tokenizer> load_tokenizer(
const std::string& tokenizer_path,
std::unique_ptr<std::vector<std::string>> special_tokens,
Expand Down Expand Up @@ -251,6 +262,15 @@ std::unique_ptr<TextLLMRunner> create_text_llm_runner(
max_cached_memory_size_bytes_));
}

return assemble_text_llm_runner(
std::move(module), std::move(tokenizer), temperature, method_name);
}

static std::unique_ptr<TextLLMRunner> assemble_text_llm_runner(
std::unique_ptr<Module> module,
std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
float temperature,
const std::string& method_name) {
// Get metadata from Module
ET_LOG(Info, "Reading metadata from model");
auto metadata_result = llm::get_llm_metadata(tokenizer.get(), module.get());
Expand Down Expand Up @@ -305,6 +325,189 @@ std::unique_ptr<TextLLMRunner> create_text_llm_runner(
temperature);
}

std::unique_ptr<TextLLMRunner> create_text_llm_runner_from_program(
std::shared_ptr<Program> program,
std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
float temperature,
const std::string& method_name) {
if (!tokenizer || !tokenizer->is_loaded()) {
ET_LOG(Error, "Tokenizer is null or not loaded");
return nullptr;
}
if (!program) {
ET_LOG(Error, "Program is null");
return nullptr;
}
// A Module over the already-loaded Program: it reuses that Program rather
// than re-loading it, and its loaded method allocates its own planned (KV)
// memory. Whether packed weights are physically shared vs. re-materialized
// per method instance is backend-dependent (serving_capacity() is the
// authority).
constexpr uint32_t kMaxCachedMemoryBytes = 1024 * 1024 * 10; // 10MB
auto module = std::make_unique<Module>(
std::move(program),
nullptr, // memory allocator
std::make_unique<executorch::extension::CPUCachingAllocator>(
kMaxCachedMemoryBytes));
return assemble_text_llm_runner(
std::move(module), std::move(tokenizer), temperature, method_name);
}

namespace detail {
// The TextLLM adapter: implements the model-agnostic LLMSession over a
// TextLLMRunner. TextLLMRunner's token-step methods are private; this adapter
// is their only (friended) caller, so the engine and server depend solely on
// LLMSession.
TextLLMSession::TextLLMSession(std::unique_ptr<TextLLMRunner> runner)
: runner_(std::move(runner)) {}

Error TextLLMSession::prefill_tokens(
std::vector<uint64_t> tokens,
const SamplingConfig* initial_sampling) {
// The model samples the FIRST generated token during prefill, so apply the
// request's sampling here (not a stale default). Only temperature is
// plumbed; reject non-default top_p/top_k/seed for parity with decode_one().
float temperature = -1.0f;
if (initial_sampling != nullptr) {
if (initial_sampling->top_p != 1.0f || initial_sampling->top_k != 0 ||
initial_sampling->seed != 0) {
ET_LOG(
Error,
"TextLLMSession: only temperature is supported; top_p/top_k/seed "
"are not yet implemented");
return ::executorch::runtime::Error::NotSupported;
}
temperature = initial_sampling->temperature;
}
return runner_->prefill_tokens(std::move(tokens), temperature).error();
}

::executorch::runtime::Result<DecodeResult> TextLLMSession::decode_one(
const SamplingConfig& sampling) {
// Only temperature is plumbed today; top_p/top_k/seed need a per-session
// sampler (a follow-up). Reject non-default values rather than silently
// ignoring them, so callers can't assume constraints are applied.
if (sampling.top_p != 1.0f || sampling.top_k != 0 || sampling.seed != 0) {
ET_LOG(
Error,
"TextLLMSession: only temperature is supported; top_p/top_k/seed are "
"not yet implemented");
return ::executorch::runtime::Error::NotSupported;
}
return runner_->decode_one(sampling.temperature);
}

Error TextLLMSession::seek(int64_t pos) {
return runner_->seek(pos);
}

int64_t TextLLMSession::position() const {
return runner_->position();
}

Error TextLLMSession::reset() {
runner_->reset();
return Error::Ok;
}

void TextLLMSession::stop() {
runner_->stop();
}
} // namespace detail

TextLLMEngine::TextLLMEngine(
std::unique_ptr<Module> loader_module,
std::shared_ptr<Program> program,
std::string tokenizer_path,
float temperature,
std::string method_name,
std::unordered_map<std::string, int64_t> metadata)
: loader_module_(std::move(loader_module)),
program_(std::move(program)),
tokenizer_path_(std::move(tokenizer_path)),
temperature_(temperature),
method_name_(std::move(method_name)),
metadata_(std::move(metadata)) {}

std::unique_ptr<TextLLMEngine> TextLLMEngine::create(
const std::string& model_path,
const std::string& tokenizer_path,
std::optional<const std::string> data_path,
float temperature,
const std::string& method_name,
Module::LoadMode load_mode) {
// External .ptd weights are not yet supported for shared sessions: each
// session Module built from the shared Program would also need the
// data_map_loader threaded into its load_method() to resolve external
// weights (see Module::load_method merged_data_map_). Fail loudly rather than
// silently produce sessions that error on first generate.
if (data_path.has_value()) {
ET_LOG(
Error,
"TextLLMEngine: external data_path (.ptd) is not yet supported for "
"shared sessions; use a self-contained .pte for now.");
return nullptr;
}
// Load the program ONCE; sessions reuse it (loaded a single time, per-session
// KV). Physical weight sharing across sessions is backend-dependent — see
// serving_capacity().
auto loader_module = std::make_unique<Module>(model_path, load_mode);
if (loader_module->load() != Error::Ok) {
ET_LOG(
Error,
"TextLLMEngine: failed to load program from %s",
model_path.c_str());
return nullptr;
}
auto program = loader_module->program();
if (!program) {
ET_LOG(Error, "TextLLMEngine: program is null after load");
return nullptr;
}
// Read model-level metadata once (shared by all sessions).
auto meta_tokenizer = load_tokenizer(tokenizer_path);
if (!meta_tokenizer) {
ET_LOG(
Error,
"TextLLMEngine: failed to load tokenizer from %s",
tokenizer_path.c_str());
return nullptr;
}
auto metadata_result =
get_llm_metadata(meta_tokenizer.get(), loader_module.get());
if (metadata_result.error() != Error::Ok) {
ET_LOG(Error, "TextLLMEngine: failed to read metadata");
return nullptr;
}
return std::unique_ptr<TextLLMEngine>(new TextLLMEngine(
std::move(loader_module),
std::move(program),
tokenizer_path,
temperature,
method_name,
metadata_result.get()));
}

// Creates one session on top of the engine's shared Program. Each call loads
// a fresh tokenizer from tokenizer_path_ (the runner takes ownership of it),
// builds a runner over program_ with the engine's temperature and method
// name, and wraps that runner in a detail::TextLLMSession.
//
// Returns Error::InvalidState when the tokenizer cannot be loaded or the
// runner cannot be built.
::executorch::runtime::Result<std::unique_ptr<LLMSession>>
TextLLMEngine::create_session() {
  auto session_tokenizer = load_tokenizer(tokenizer_path_);
  if (!session_tokenizer) {
    ET_LOG(
        Error,
        "TextLLMEngine: failed to load tokenizer from %s",
        tokenizer_path_.c_str());
    return Error::InvalidState;
  }

  auto session_runner = create_text_llm_runner_from_program(
      program_, std::move(session_tokenizer), temperature_, method_name_);
  if (!session_runner) {
    ET_LOG(Error, "TextLLMEngine: failed to build session runner");
    return Error::InvalidState;
  }

  auto session =
      std::make_unique<detail::TextLLMSession>(std::move(session_runner));
  return std::unique_ptr<LLMSession>(std::move(session));
}

std::unique_ptr<MultimodalRunner> create_multimodal_runner(
const std::string& model_path,
std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
Expand Down
84 changes: 84 additions & 0 deletions extension/llm/runner/llm_runner_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <vector>

#include <executorch/extension/llm/runner/constants.h>
#include <executorch/extension/llm/runner/llm_session.h>
#include <executorch/extension/module/module.h>
#include <executorch/runtime/core/result.h>
#include <executorch/runtime/platform/compiler.h>
Expand Down Expand Up @@ -141,6 +142,89 @@ ET_EXPERIMENTAL std::unique_ptr<TextLLMRunner> create_text_llm_runner(
const std::string& method_name = "forward",
Module::LoadMode load_mode = Module::LoadMode::MmapUseMlockIgnoreErrors);

/**
 * @brief Creates a TextLLMRunner over an already-loaded Program.
 *
 * Unlike create_text_llm_runner(model_path, ...), this does not load the model
 * file again: the resulting runner's Module reuses `program` while owning its
 * own method state and KV cache. This is the per-session construction path for
 * TextLLMEngine — N sessions reuse one loaded Program but isolate their mutable
 * KV state. Whether they also avoid re-materializing packed weights per session
 * is backend-dependent (serving_capacity() is authoritative).
 *
 * The runner's Module is constructed with a CPUCachingAllocator capped at a
 * fixed 10MB; that size is not configurable through this function.
 *
 * The caller must keep the DataLoader backing `program` alive for the lifetime
 * of every runner created from it (TextLLMEngine holds the loader Module).
 *
 * @param program Shared, already-loaded program. Must not be null.
 * @param tokenizer Initialized tokenizer instance (owned by the new runner).
 * Must be non-null and report is_loaded().
 * @param temperature Optional temperature (deprecated; prefer
 * GenerationConfig).
 * @param method_name Name of the method to execute in the model.
 * @return std::unique_ptr<TextLLMRunner> on success, or nullptr on failure
 * (including a null/unloaded tokenizer or a null program, each of which is
 * logged as an error).
 */
ET_EXPERIMENTAL std::unique_ptr<TextLLMRunner>
create_text_llm_runner_from_program(
std::shared_ptr<Program> program,
std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
float temperature = -1.0f,
const std::string& method_name = "forward");

/**
 * @brief Engine for multi-session text generation over one loaded Program.
 *
 * Loads the model's Program (weights/constants) once; create_session() builds a
 * TextLLMRunner that reuses that Program but owns its own method/KV state. This
 * is the correctness-first foundation for serving multiple conversations.
 * Backend execution should be serialized by the caller until per-backend thread
 * safety is proven (Module::execute is not assumed thread-safe). Whether extra
 * sessions avoid duplicating packed weights is backend-dependent and reported
 * by serving_capacity() (conservatively one).
 *
 * Instances are obtained through create(); the constructor is private and the
 * class is non-copyable.
 */
class ET_EXPERIMENTAL TextLLMEngine : public LLMEngine {
public:
/**
 * @brief Loads the program and model metadata once and returns an engine.
 *
 * @param model_path Path of the model file to load.
 * @param tokenizer_path Path of the tokenizer; loaded here to read metadata
 * and loaded again for every session created later.
 * @param data_path External data (.ptd) path. Not yet supported: passing a
 * value makes create() log an error and return nullptr.
 * @param temperature Temperature forwarded to each session's runner.
 * @param method_name Name of the method each session's runner executes.
 * @param load_mode Load mode passed to the Module that loads the program.
 * @return The engine, or nullptr if the program, tokenizer, or metadata could
 * not be loaded (each failure is logged).
 */
static std::unique_ptr<TextLLMEngine> create(
const std::string& model_path,
const std::string& tokenizer_path,
std::optional<const std::string> data_path = std::nullopt,
float temperature = -1.0f,
const std::string& method_name = "forward",
Module::LoadMode load_mode = Module::LoadMode::MmapUseMlockIgnoreErrors);

// Returns a TextLLMSession (LLMSession) that reuses this engine's loaded
// Program (physical weight sharing is backend-dependent; see
// serving_capacity). Each call loads its own tokenizer from the stored
// tokenizer path; Error::InvalidState is returned if the tokenizer or the
// session's runner cannot be created.
::executorch::runtime::Result<std::unique_ptr<LLMSession>> create_session()
override;
// Conservative: a single physical session (no proven cross-session weight
// sharing). Raise on a backend proven to share packed weights.
// Currently returns a default-constructed LLMServingCapacity.
LLMServingCapacity serving_capacity() const override {
return LLMServingCapacity{};
}
// Model-level metadata, read once in create() and shared by all sessions.
const std::unordered_map<std::string, int64_t>& metadata() const override {
return metadata_;
}

// Non-copyable: the engine uniquely owns the loader Module.
TextLLMEngine(const TextLLMEngine&) = delete;
TextLLMEngine& operator=(const TextLLMEngine&) = delete;

private:
// Only create() constructs engines; it passes in pieces it has already
// loaded and validated.
TextLLMEngine(
std::unique_ptr<Module> loader_module,
std::shared_ptr<Program> program,
std::string tokenizer_path,
float temperature,
std::string method_name,
std::unordered_map<std::string, int64_t> metadata);

// Keeps the shared Program's DataLoader alive for the lifetime of sessions.
std::unique_ptr<Module> loader_module_;
// The Program loaded by loader_module_; handed to every session's runner.
std::shared_ptr<Program> program_;
// Tokenizer location; create_session() loads a fresh tokenizer from it.
std::string tokenizer_path_;
// Temperature forwarded to each session's runner.
float temperature_;
// Method name forwarded to each session's runner.
std::string method_name_;
// Metadata captured in create() and exposed through metadata().
std::unordered_map<std::string, int64_t> metadata_;
};

/**
* @brief Creates a MultimodalRunner instance with dependency injection
*
Expand Down
Loading
Loading