Merge branch 'main' into export-D108082431

doggeral · web-flow · commit ac645d40f714 · 2026-06-15T16:09:10.000-07:00
diff --git a/extension/llm/runner/README.md b/extension/llm/runner/README.md
@@ -731,7 +731,7 @@ std::unordered_map<std::string, int64_t> get_llm_metadata(
 |-----------|------|---------|-------------|
 | `max_new_tokens` | `int32_t` | `-1` | Maximum new tokens to generate (-1 = use available context) |
 | `seq_len` | `int32_t` | `1024` | Total sequence length including prompt |
-| `temperature` | `float` | `0.8f` | Sampling temperature (0.0 = deterministic, 1.0+ = creative) |
+| `temperature` | `float` | `0.8f` | Sampling temperature in [0.0, 1.0] (0.0 = deterministic) |
 | `echo` | `bool` | `true` | Whether to echo the input prompt |
 | `num_bos` | `int8_t` | `1` | Number of beginning-of-sequence tokens |
 | `num_eos` | `int8_t` | `1` | Number of end-of-sequence tokens |
@@ -824,7 +824,7 @@ GenerationConfig config;
 config.temperature = 0.1f;  // Very deterministic
 runner->generate(factual_prompt, config, callback);
 
-config.temperature = 1.2f;  // Very creative
+config.temperature = 1.0f;  // Highest supported temperature
 runner->generate(creative_prompt, config, callback);
 ```
 
diff --git a/extension/llm/runner/test/test_text_llm_runner.cpp b/extension/llm/runner/test/test_text_llm_runner.cpp
@@ -709,6 +709,26 @@ TEST_F(RunnerTest, TextTokenGeneratorProcessorChainMasksMultipleTokens) {
   EXPECT_EQ(generated_tokens, expected);
 }
 
+TEST_F(RunnerTest, TextTokenGeneratorRejectsTemperatureOutOfRange) {
+  auto tokenizer = createMockTokenizer();
+  auto text_decoder_runner = createMockTextDecoderRunner();
+  Stats stats;
+  auto generator = createTextTokenGenerator(
+      tokenizer.get(), text_decoder_runner.get(), &stats);
+
+  std::vector<uint64_t> tokens = {1, 2, 3};
+  EXPECT_CALL(*text_decoder_runner, step(_, _)).Times(0);
+
+  EXPECT_EQ(
+      generator->generate(tokens, 3, 3, -0.1f, [](const std::string&) {})
+          .error(),
+      Error::InvalidArgument);
+  EXPECT_EQ(
+      generator->generate(tokens, 3, 3, 1.1f, [](const std::string&) {})
+          .error(),
+      Error::InvalidArgument);
+}
+
 // Without any processors, greedy argmax picks token 3 (zero-overhead path).
 TEST_F(RunnerTest, TextTokenGeneratorWithoutProcessorPicksArgmax) {
   auto tokenizer = createMockTokenizer();
diff --git a/extension/llm/runner/test/test_text_prefiller.cpp b/extension/llm/runner/test/test_text_prefiller.cpp
@@ -80,7 +80,12 @@ class TextPrefillerTest : public Test {
         ::executorch::runtime::Result<uint64_t>,
         prefill_chunk,
         (std::vector<uint64_t>&, int64_t&),
-        ());
+        (override));
+    MOCK_METHOD(
+        ::executorch::runtime::Result<uint64_t>,
+        prefill_chunk,
+        (std::vector<uint64_t>&, int64_t&, float),
+        (override));
   };
 
   // Create a mock TextPrefiller
@@ -112,27 +117,145 @@ TEST_F(TextPrefillerTest, PrefillCallsPrefillChunkOnceWhenPromptFits) {
   int64_t start_pos = 0;
 
   // Expect prefill_chunk to be called exactly once with the entire prompt
-  EXPECT_CALL(*prefiller, prefill_chunk(_, _))
+  constexpr float temperature = 0.7f;
+  EXPECT_CALL(*prefiller, prefill_chunk(_, _, FloatEq(temperature)))
       .Times(1)
-      .WillOnce([&](std::vector<uint64_t>& tokens, int64_t& pos) {
+      .WillOnce([&](std::vector<uint64_t>& tokens, int64_t& pos, float temp) {
         // Verify the tokens passed to prefill_chunk
         EXPECT_EQ(tokens.size(), prompt_tokens.size());
         for (size_t i = 0; i < tokens.size(); i++) {
           EXPECT_EQ(tokens[i], prompt_tokens[i]);
         }
         // Verify the position
         EXPECT_EQ(pos, start_pos);
+        EXPECT_EQ(temp, temperature);
         return Result<uint64_t>(42);
       });
 
   // Call prefill
-  auto result = prefiller->prefill(prompt_tokens, start_pos);
+  auto result = prefiller->prefill(prompt_tokens, start_pos, temperature);
 
   // Verify the result
   EXPECT_EQ(result.error(), Error::Ok);
   EXPECT_EQ(result.get(), 42);
 }
 
+TEST_F(TextPrefillerTest, TwoArgumentPrefillUsesGreedyTemperature) {
+  auto prefiller = createMockTextPrefiller(10);
+
+  std::vector<uint64_t> prompt_tokens = {1, 2, 3};
+  int64_t start_pos = 0;
+
+  EXPECT_CALL(*prefiller, prefill_chunk(_, _, FloatEq(0.0f)))
+      .Times(1)
+      .WillOnce([](std::vector<uint64_t>&, int64_t&, float) {
+        return Result<uint64_t>(42);
+      });
+
+  auto result = prefiller->prefill(prompt_tokens, start_pos);
+
+  EXPECT_EQ(result.error(), Error::Ok);
+  EXPECT_EQ(result.get(), 42);
+}
+
+TEST_F(TextPrefillerTest, PrefillAcceptsTemperatureBounds) {
+  auto prefiller = createMockTextPrefiller(10);
+
+  std::vector<uint64_t> prompt_tokens = {1, 2, 3};
+  int64_t start_pos = 0;
+
+  {
+    InSequence seq;
+    EXPECT_CALL(*prefiller, prefill_chunk(_, _, FloatEq(0.0f)))
+        .WillOnce([](std::vector<uint64_t>&, int64_t&, float) {
+          return Result<uint64_t>(41);
+        });
+    EXPECT_CALL(*prefiller, prefill_chunk(_, _, FloatEq(1.0f)))
+        .WillOnce([](std::vector<uint64_t>&, int64_t&, float) {
+          return Result<uint64_t>(42);
+        });
+  }
+
+  auto greedy = prefiller->prefill(prompt_tokens, start_pos, 0.0f);
+  auto max_temp = prefiller->prefill(prompt_tokens, start_pos, 1.0f);
+
+  EXPECT_EQ(greedy.error(), Error::Ok);
+  EXPECT_EQ(greedy.get(), 41);
+  EXPECT_EQ(max_temp.error(), Error::Ok);
+  EXPECT_EQ(max_temp.get(), 42);
+}
+
+TEST_F(TextPrefillerTest, PrefillRejectsTemperatureOutOfRange) {
+  auto prefiller = createMockTextPrefiller(10);
+
+  std::vector<uint64_t> prompt_tokens = {1, 2, 3};
+  int64_t start_pos = 0;
+
+  EXPECT_CALL(*prefiller, prefill_chunk(_, _, _)).Times(0);
+
+  EXPECT_EQ(
+      prefiller->prefill(prompt_tokens, start_pos, -0.1f).error(),
+      Error::InvalidArgument);
+  EXPECT_EQ(
+      prefiller->prefill(prompt_tokens, start_pos, 1.1f).error(),
+      Error::InvalidArgument);
+}
+
+TEST_F(TextPrefillerTest, TwoArgumentPrefillChunkOverrideStillDispatches) {
+  class LegacyPrefiller final : public TextPrefiller {
+   public:
+    explicit LegacyPrefiller(TextDecoderRunner* text_decoder_runner)
+        : TextPrefiller(text_decoder_runner, true, true, 10) {}
+
+    Result<uint64_t> prefill_chunk(std::vector<uint64_t>&, int64_t&) override {
+      called = true;
+      return Result<uint64_t>(42);
+    }
+
+    bool called = false;
+  };
+
+  LegacyPrefiller prefiller(&text_decoder_runner_);
+  TextPrefiller* base = &prefiller;
+  std::vector<uint64_t> prompt_tokens = {1, 2, 3};
+  int64_t start_pos = 0;
+
+  auto result = base->prefill_chunk(prompt_tokens, start_pos);
+
+  EXPECT_EQ(result.error(), Error::Ok);
+  EXPECT_EQ(result.get(), 42);
+  EXPECT_TRUE(prefiller.called);
+}
+
+TEST_F(TextPrefillerTest, ChunkedPrefillSamplesOnlyLastChunkWithTemperature) {
+  auto prefiller = createMockTextPrefiller(3);
+
+  std::vector<uint64_t> prompt_tokens = {1, 2, 3, 4, 5, 6, 7, 8};
+  int64_t start_pos = 0;
+  constexpr float temperature = 0.9f;
+
+  {
+    InSequence seq;
+    EXPECT_CALL(*prefiller, prefill_chunk(_, _, FloatEq(0.0f)))
+        .WillOnce([](std::vector<uint64_t>&, int64_t&, float) {
+          return Result<uint64_t>(10);
+        });
+    EXPECT_CALL(*prefiller, prefill_chunk(_, _, FloatEq(0.0f)))
+        .WillOnce([](std::vector<uint64_t>&, int64_t&, float) {
+          return Result<uint64_t>(11);
+        });
+    EXPECT_CALL(*prefiller, prefill_chunk(_, _, FloatEq(temperature)))
+        .WillOnce([](std::vector<uint64_t>&, int64_t&, float) {
+          return Result<uint64_t>(12);
+        });
+  }
+
+  auto result = prefiller->prefill(prompt_tokens, start_pos, temperature);
+
+  EXPECT_EQ(result.error(), Error::Ok);
+  EXPECT_EQ(result.get(), 12);
+}
+
 // Test that prefill() calls prefill_chunk() multiple times when prompt tokens >
 // max_seq_len
 TEST_F(
@@ -217,14 +340,14 @@ TEST_F(TextPrefillerTest, PrefillHandlesPrefillChunkErrorsCorrectly) {
     InSequence seq;
 
     // First chunk: tokens [1, 2, 3] - succeeds
-    EXPECT_CALL(*prefiller, prefill_chunk(_, _))
-        .WillOnce([&](std::vector<uint64_t>& tokens, int64_t& pos) {
+    EXPECT_CALL(*prefiller, prefill_chunk(_, _, _))
+        .WillOnce([&](std::vector<uint64_t>& tokens, int64_t& pos, float) {
           return Result<uint64_t>(10);
         });
 
     // Second chunk: tokens [4, 5] - fails
-    EXPECT_CALL(*prefiller, prefill_chunk(_, _))
-        .WillOnce([&](std::vector<uint64_t>& tokens, int64_t& pos) {
+    EXPECT_CALL(*prefiller, prefill_chunk(_, _, _))
+        .WillOnce([&](std::vector<uint64_t>& tokens, int64_t& pos, float) {
           return Result<uint64_t>(Error::InvalidArgument);
         });
   }
@@ -236,6 +359,23 @@ TEST_F(TextPrefillerTest, PrefillHandlesPrefillChunkErrorsCorrectly) {
   EXPECT_EQ(result.error(), Error::InvalidArgument);
 }
 
+TEST_F(TextPrefillerTest, PrefillChunkRejectsTemperatureOutOfRange) {
+  auto prefiller = createTextPrefiller(10, true, true);
+
+  std::vector<uint64_t> prompt_tokens = {1, 2, 3};
+  int64_t start_pos = 0;
+
+  EXPECT_CALL(text_decoder_runner_, step(_, _)).Times(0);
+
+  EXPECT_EQ(
+      prefiller->prefill_chunk(prompt_tokens, start_pos, -0.1f).error(),
+      Error::InvalidArgument);
+  EXPECT_EQ(
+      prefiller->prefill_chunk(prompt_tokens, start_pos, 1.1f).error(),
+      Error::InvalidArgument);
+  EXPECT_EQ(start_pos, 0);
+}
+
 // Test that prefill_chunk() works correctly with parallel prefill enabled
 TEST_F(TextPrefillerTest, PrefillChunkWorksWithParallelPrefill) {
   // Create a TextPrefiller with parallel prefill enabled
diff --git a/extension/llm/runner/text_prefiller.cpp b/extension/llm/runner/text_prefiller.cpp
@@ -29,7 +29,19 @@ TextPrefiller::TextPrefiller(
 ::executorch::runtime::Result<uint64_t> TextPrefiller::prefill(
     std::vector<uint64_t>& prompt_tokens,
     int64_t& start_pos) {
+  return prefill(prompt_tokens, start_pos, 0.0f);
+}
+
+::executorch::runtime::Result<uint64_t> TextPrefiller::prefill(
+    std::vector<uint64_t>& prompt_tokens,
+    int64_t& start_pos,
+    float temperature) {
   ET_CHECK_MSG(!prompt_tokens.empty(), "Prompt cannot be null");
+  ET_CHECK_OR_RETURN_ERROR(
+      temperature >= 0.0f && temperature <= 1.0f,
+      InvalidArgument,
+      "Temperature must be in [0, 1], got %f",
+      static_cast<double>(temperature));
   if (!text_decoder_runner_->is_method_loaded()) {
     ET_CHECK_OK_OR_RETURN_ERROR(text_decoder_runner_->load());
   }
@@ -54,8 +66,14 @@ ::executorch::runtime::Result<uint64_t> TextPrefiller::prefill(
               num_tokens_to_prefill_with,
           prompt_tokens_to_process.begin());
 
-      // Process this chunk
-      auto chunk_result = prefill_chunk(prompt_tokens_to_process, start_pos);
+      // Only the last chunk's token is kept; earlier chunks decode greedily.
+      const bool is_last_chunk =
+          num_tokens_to_process + num_tokens_to_prefill_with >=
+          num_prompt_tokens;
+      auto chunk_result = prefill_chunk(
+          prompt_tokens_to_process,
+          start_pos,
+          is_last_chunk ? temperature : 0.0f);
       ET_CHECK_OK_OR_RETURN_ERROR(chunk_result.error());
       cur_token = chunk_result.get();
 
@@ -65,13 +83,25 @@ ::executorch::runtime::Result<uint64_t> TextPrefiller::prefill(
     return cur_token;
   } else {
     // If prompt tokens don't exceed max_seq_len_, process them directly
-    return prefill_chunk(prompt_tokens, start_pos);
+    return prefill_chunk(prompt_tokens, start_pos, temperature);
   }
 }
 
 ::executorch::runtime::Result<uint64_t> TextPrefiller::prefill_chunk(
     std::vector<uint64_t>& prompt_tokens,
     int64_t& start_pos) {
+  return prefill_chunk(prompt_tokens, start_pos, 0.0f);
+}
+
+::executorch::runtime::Result<uint64_t> TextPrefiller::prefill_chunk(
+    std::vector<uint64_t>& prompt_tokens,
+    int64_t& start_pos,
+    float temperature) {
+  ET_CHECK_OR_RETURN_ERROR(
+      temperature >= 0.0f && temperature <= 1.0f,
+      InvalidArgument,
+      "Temperature must be in [0, 1], got %f",
+      static_cast<double>(temperature));
   // enable_parallel_prefill_ maybe set even when not using kv cache
   // When kv cache is not used, start pos is ignored
   int32_t num_prompt_tokens = prompt_tokens.size();
@@ -92,7 +122,8 @@ ::executorch::runtime::Result<uint64_t> TextPrefiller::prefill_chunk(
         Info, "Prefill token result numel(): %zu", outputs_res.get().numel());
 
     start_pos += num_prompt_tokens;
-    cur_token = text_decoder_runner_->logits_to_token(outputs_res.get());
+    cur_token =
+        text_decoder_runner_->logits_to_token(outputs_res.get(), temperature);
   } else { // sequential prefill
     int64_t pos = 0; // position in the sequence
     // NOLINTNEXTLINE(facebook-hte-ParameterUncheckedArrayBounds)
@@ -128,7 +159,8 @@ ::executorch::runtime::Result<uint64_t> TextPrefiller::prefill_chunk(
       start_pos++;
     }
 
-    cur_token = text_decoder_runner_->logits_to_token(logits_tensor);
+    cur_token =
+        text_decoder_runner_->logits_to_token(logits_tensor, temperature);
   }
   return cur_token;
 }
diff --git a/extension/llm/runner/text_prefiller.h b/extension/llm/runner/text_prefiller.h
@@ -32,23 +32,43 @@ class ET_EXPERIMENTAL TextPrefiller {
    * tokenizer.
    * @param start_pos The starting position in KV cache of the input in the LLM
    * Module.
+   * @note Equivalent to `prefill(prompt_tokens, start_pos, 0.0f)`.
    * @return The next token of the LLM Module after prefill.
    */
   virtual ::executorch::runtime::Result<uint64_t> prefill(
       std::vector<uint64_t>& prompt_tokens,
       int64_t& start_pos);
 
+  /**
+   * Like `prefill(prompt_tokens, start_pos)`, but samples the first generated
+   * token with `temperature`; returns InvalidArgument outside [0.0, 1.0].
+   */
+  virtual ::executorch::runtime::Result<uint64_t> prefill(
+      std::vector<uint64_t>& prompt_tokens,
+      int64_t& start_pos,
+      float temperature);
+
   /**
    * Helper method to prefill a chunk of tokens.
    * @param prompt_tokens The chunk of text prompt tokens to process.
    * @param start_pos The starting position in KV cache of the input in the LLM
    * Module.
+   * @note Equivalent to `prefill_chunk(prompt_tokens, start_pos, 0.0f)`.
    * @return The next token of the LLM Module after prefilling this chunk.
    */
   virtual ::executorch::runtime::Result<uint64_t> prefill_chunk(
       std::vector<uint64_t>& prompt_tokens,
       int64_t& start_pos);
 
+  /**
+   * Like `prefill_chunk(prompt_tokens, start_pos)`, but samples the produced
+   * token with `temperature`; returns InvalidArgument outside [0.0, 1.0].
+   */
+  virtual ::executorch::runtime::Result<uint64_t> prefill_chunk(
+      std::vector<uint64_t>& prompt_tokens,
+      int64_t& start_pos,
+      float temperature);
+
   /**
    * Load the necessary resources for the TextPrefiller.
    * This method should be called before using the prefill methods.
diff --git a/extension/llm/runner/text_token_generator.h b/extension/llm/runner/text_token_generator.h